With 80,000 features and 1,000 clusters, the cluster centers alone take
80,000 × 1,000 = 80,000,000 doubles, which at 8 bytes each is about
640 MB. With 10 partitions, the driver may need roughly 6 GB to collect
the center updates from the workers. My guess is that the driver ran out
of memory and died. Did you specify the driver memory with spark-submit?
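For example, a minimal sketch (the 8g figure and the script name are
illustrative, not from your setup; size the driver to fit the collected
updates):

    spark-submit --driver-memory 8g parallel_kmeans.py

-Xiangrui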

On Thu, Jun 18, 2015 at 12:22 PM, Rogers Jeffrey
<rogers.john2...@gmail.com> wrote:
> Hi All,
>
> I am trying to run KMeans clustering on a large data set with 12,000 points
> and 80,000 dimensions. I have a Spark cluster on EC2 in standalone mode, with
> 8 workers running on 2 slaves that together have 160 GB of RAM and 40 vCPUs.
>
> My code is as follows:
>
> import numpy as np
> from pyspark import SparkContext
> from pyspark.mllib.linalg import Vectors
> from pyspark.mllib.clustering import KMeans
>
> def convert_into_sparse_vector(A):
>     # Keep only the non-NaN entries of the row as index -> value pairs.
>     non_nan_indices = np.nonzero(~np.isnan(A))
>     non_nan_values = A[non_nan_indices]
>     dictionary = dict(zip(non_nan_indices[0], non_nan_values))
>     return Vectors.sparse(len(A), dictionary)
>
> X = [convert_into_sparse_vector(A) for A in complete_dataframe.values]
> sc = SparkContext(appName="parallel_kmeans")
> data = sc.parallelize(X, 10)
> model = KMeans.train(data, 1000, initializationMode="k-means||")
>
> where complete_dataframe is a pandas DataFrame that holds my data.
>
> I get the error: Py4JNetworkError: An error occurred while trying to connect
> to the Java server.
>
> The error trace is as follows:
>
>> ----------------------------------------
>> Exception happened during processing of request from ('127.0.0.1', 41360)
>> Traceback (most recent call last):
>>   File "/usr/lib64/python2.6/SocketServer.py", line 283, in _handle_request_noblock
>>     self.process_request(request, client_address)
>>   File "/usr/lib64/python2.6/SocketServer.py", line 309, in process_request
>>     self.finish_request(request, client_address)
>>   File "/usr/lib64/python2.6/SocketServer.py", line 322, in finish_request
>>     self.RequestHandlerClass(request, client_address, self)
>>   File "/usr/lib64/python2.6/SocketServer.py", line 617, in __init__
>>     self.handle()
>>   File "/root/spark/python/pyspark/accumulators.py", line 235, in handle
>>     num_updates = read_int(self.rfile)
>>   File "/root/spark/python/pyspark/serializers.py", line 544, in read_int
>>     raise EOFError
>> EOFError
>> ----------------------------------------
>>
>> ---------------------------------------------------------------------------
>> Py4JNetworkError                          Traceback (most recent call last)
>> <ipython-input-13-3dd00c2c5e93> in <module>()
>> ----> 1 model = KMeans.train(data, 1000, initializationMode="k-means||")
>>
>> /root/spark/python/pyspark/mllib/clustering.pyc in train(cls, rdd, k, maxIterations, runs, initializationMode, seed, initializationSteps, epsilon)
>>     134         """Train a k-means clustering model."""
>>     135         model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations,
>> --> 136                               runs, initializationMode, seed, initializationSteps, epsilon)
>>     137         centers = callJavaFunc(rdd.context, model.clusterCenters)
>>     138         return KMeansModel([c.toArray() for c in centers])
>>
>> /root/spark/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
>>     126     sc = SparkContext._active_spark_context
>>     127     api = getattr(sc._jvm.PythonMLLibAPI(), name)
>> --> 128     return callJavaFunc(sc, api, *args)
>>
>> /root/spark/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
>>     119     """ Call Java Function """
>>     120     args = [_py2java(sc, a) for a in args]
>> --> 121     return _java2py(sc, func(*args))
>>
>> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
>>     534             END_COMMAND_PART
>>     535
>> --> 536         answer = self.gateway_client.send_command(command)
>>     537         return_value = get_return_value(answer, self.gateway_client,
>>     538                 self.target_id, self.name)
>>
>> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in send_command(self, command, retry)
>>     367             if retry:
>>     368                 #print_exc()
>> --> 369                 response = self.send_command(command)
>>     370             else:
>>     371                 response = ERROR
>>
>> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in send_command(self, command, retry)
>>     360          the Py4J protocol.
>>     361         """
>> --> 362         connection = self._get_connection()
>>     363         try:
>>     364             response = connection.send_command(command)
>>
>> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in _get_connection(self)
>>     316             connection = self.deque.pop()
>>     317         except Exception:
>> --> 318             connection = self._create_connection()
>>     319         return connection
>>
>> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in _create_connection(self)
>>     323         connection = GatewayConnection(self.address, self.port,
>>     324                 self.auto_close, self.gateway_property)
>> --> 325         connection.start()
>>     326         return connection
>>
>> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in start(self)
>>     430                 'server'
>>     431             logger.exception(msg)
>> --> 432             raise Py4JNetworkError(msg)
>>     433
>>     434     def close(self):
>>
>> Py4JNetworkError: An error occurred while trying to connect to the Java server
>
>
> Is there any specific setting that I am missing that causes this error?
>
> Thanks and Regards,
> Rogers Jeffrey L

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
For additional commands, e-mail: user-h...@spark.apache.org
