Which version of Spark are you using? It looks like you are hitting the open file handle limit, in which case you might want to increase the ulimit. You can validate this by looking in the worker logs (they would probably show a "Too many open files" exception).
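If it helps, here is a quick way to check the current limit from Python on a node. This is just a sketch for inspection; the permanent fix is still raising the nofile ulimit in /etc/security/limits.conf (or the Spark startup scripts) on every node, and the values below are illustrative:

    import resource

    # Inspect the current open-file limits for this process
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    print("open file limit: soft=%d hard=%d" % (soft, hard))

    # Raise the soft limit up to the hard limit, for this process only;
    # a cluster-wide change still needs ulimit / limits.conf on each node
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))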
Thanks
Best Regards

On Thu, Aug 6, 2015 at 8:35 PM, Cat <caterina.gro...@dsp.io> wrote:
> Hello,
>
> I am using the Python API to perform a grid search and train models using
> LogisticRegressionWithSGD. I am using r3.xl machines in EC2, running on top
> of YARN in cluster mode.
>
> The training RDD is persisted in memory and on disk. Some of the models
> train successfully, but then at some point during the grid search I get an
> error. It looks like the Python broadcast is looking for a part of the RDD
> which is no longer there. I scanned the logs for further errors but could
> not find anything.
>
> Any ideas of what could be causing this, and what should I be looking for?
>
> Many thanks.
> Cat
>
>     model = LogisticRegressionWithSGD.train(the_training, iterations=i,
>                                             regParam=c, miniBatchFraction=0.8)
>   File "/home/hadoop/spark/python/pyspark/mllib/classification.py", line 164, in train
>     return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights)
>   File "/home/hadoop/spark/python/pyspark/mllib/regression.py", line 140, in _regression_train_wrapper
>     weights, intercept = train_func(data, _convert_to_vector(initial_weights))
>   File "/home/hadoop/spark/python/pyspark/mllib/classification.py", line 162, in train
>     bool(intercept))
>   File "/home/hadoop/spark/python/pyspark/mllib/common.py", line 120, in callMLlibFunc
>     return callJavaFunc(sc, api, *args)
>   File "/home/hadoop/spark/python/pyspark/mllib/common.py", line 113, in callJavaFunc
>     return _java2py(sc, func(*args))
>   File "/home/hadoop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
>     self.target_id, self.name)
>   File "/home/hadoop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
>     format(target_id, '.', name), value)
> Py4JJavaError: An error occurred while calling o271.trainLogisticRegressionModelWithSGD.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task serialization failed:
> java.io.FileNotFoundException:
> /mnt/spark/spark-b07b34f8-66c3-43ae-a3ed-0c291724409b/pyspark-4196e8e5-8024-4ec5-a7bb-a60b216e6e74/tmpbCjiSR (No such file or directory)
>   java.io.FileInputStream.open(Native Method)
>   java.io.FileInputStream.<init>(FileInputStream.java:146)
>   org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply$mcJ$sp(PythonRDD.scala:848)
>   org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply(PythonRDD.scala:847)
>   org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply(PythonRDD.scala:847)
>   org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1153)
>   org.apache.spark.api.python.PythonBroadcast.writeObject(PythonRDD.scala:847)
>   sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>   sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>   sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   java.lang.reflect.Method.invoke(Method.java:606)
>   java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:988)
>   java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1495)
>   java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
>   java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
>   java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>   org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:44)
>   org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:110)
>   org.apache.spark.storage.BlockManager.dataSerializeStream(BlockManager.scala:1176)
>   org.apache.spark.storage.DiskStore.putIterator(DiskStore.scala:79)
>   org.apache.spark.storage.DiskStore.putArray(DiskStore.scala:64)
>   org.apache.spark.storage.BlockManager.dropFromMemory(BlockManager.scala:1028)
>   org.apache.spark.storage.MemoryStore$$anonfun$ensureFreeSpace$4.apply(MemoryStore.scala:419)
>   org.apache.spark.storage.MemoryStore$$anonfun$ensureFreeSpace$4.apply(MemoryStore.scala:408)
>   scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>   scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>   org.apache.spark.storage.MemoryStore.ensureFreeSpace(MemoryStore.scala:408)
>   org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:263)
>   org.apache.spark.storage.MemoryStore.putIterator(MemoryStore.scala:136)
>   org.apache.spark.storage.MemoryStore.putIterator(MemoryStore.scala:114)
>   org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:786)
>   org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:637)
>   org.apache.spark.storage.BlockManager.putSingle(BlockManager.scala:991)
>   org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:98)
>   org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84)
>   org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
>   org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29)
>   org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
>   org.apache.spark.SparkContext.broadcast(SparkContext.scala:1051)
>   org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:839)
>   org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:778)
>   org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:762)
>   org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1362)
>   org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
>   org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
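For context, the setup described above boils down to something like the sketch below. The parameter grids and every name other than the_training are guesses based on the snippet in the question, not the actual code:

    from pyspark import StorageLevel
    from pyspark.mllib.classification import LogisticRegressionWithSGD

    # the_training is the training RDD of LabeledPoints from the question,
    # persisted in memory and on disk as described; the grids are illustrative
    the_training.persist(StorageLevel.MEMORY_AND_DISK)

    models = {}
    for i in [50, 100, 200]:          # iterations
        for c in [0.01, 0.1, 1.0]:    # regParam
            models[(i, c)] = LogisticRegressionWithSGD.train(
                the_training, iterations=i, regParam=c,
                miniBatchFraction=0.8)

Each train() call goes through broadcast and temp-file machinery on its own, so over a long grid search the open handles and temp files under /mnt/spark can add up, which would fit the ulimit theory above.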