I am using pyspark spark-1.6.1-bin-hadoop2.6 and python3. I have a data
frame with a column I need to convert to a sparse vector. I get an exception.
Any idea what my bug is?
Kind regards,
Andy
Py4JJavaError: An error occurred while calling
None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException: java.lang.RuntimeException: Unable to
instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
at
org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:20
4)
Here is my python code fragment with a more complete stack trace
# load data set
from pyspark.sql import HiveContext #,SQLContext, Row
# window functions require HiveContext (spark 2.x will not require hive)
#sqlContext = SQLContext(sc)
# NOTE(review): the pasted Py4JJavaError is raised while constructing
# HiveContext itself ("Unable to instantiate SessionHiveMetaStoreClient").
# That failure is typically caused by another process holding the local
# Derby metastore lock (e.g. a second spark shell/notebook started from
# the same working directory) -- verify no other session is running.
hiveSqlContext = HiveContext(sc)
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import VectorUDT
#sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
# = 3 = size
# [0,1] int indices
#[1.0, 3.0] values
# Schema of the input DataFrame `df` (printSchema() output):
"""
root
|-- id: string (nullable = true)
|-- samples: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: long (nullable = false)
| | |-- rateStr: string (nullable = false)
"""
def toSparseVector(pojoList):
    """Build a binary sparse vector whose active indices are the ``id``
    fields of the structs in *pojoList* (each active entry gets value 1.0).

    NOTE(review): ``numDimensions`` is a free module-level variable; it
    must be defined (as the vector size / vocabulary size) before this
    function is invoked as a UDF -- confirm it is set.
    NOTE(review): ``Vectors.sparse`` expects the index list in ascending
    order in some Spark versions -- verify ``pojo.id`` values arrive
    sorted, or sort them here.
    """
    indices = [pojo.id for pojo in pojoList]
    # BUG FIX: the original called Vectors.spark(...), which does not
    # exist; the factory method is Vectors.sparse(size, indices, values).
    values = np.ones(len(indices))
    return Vectors.sparse(numDimensions, indices, values)
# BUG FIX: the original line had an unbalanced trailing ")" and the column
# reference contained a garbled quote character (df[³samples"]).  Also,
# udf() is never imported in this fragment; it lives in
# pyspark.sql.functions.
from pyspark.sql.functions import udf

myUDF = udf(toSparseVector, VectorUDT())
features = df.withColumn(newColName, myUDF(df["samples"]))
Py4JJavaError Traceback (most recent call last)
<ipython-input-77-30ab820130a0> in <module>()
30 #myUDF = udf(lambda pojoList: labelStr if (labelStr == "noise") else
"injury", StringType())
31
---> 32 myUDF = udf(toSparseVector, VectorUDT()) #
33 features = df.withColumn(newColName, myUDF(df["follows"]))
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functi
ons.py in udf(f, returnType)
1595 [Row(slen=5), Row(slen=3)]
1596 """
-> 1597 return UserDefinedFunction(f, returnType)
1598
1599 blacklist = ['map', 'since', 'ignore_unicode_prefix']
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functi
ons.py in __init__(self, func, returnType, name)
1556 self.returnType = returnType
1557 self._broadcast = None
-> 1558 self._judf = self._create_judf(name)
1559
1560 def _create_judf(self, name):
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functi
ons.py in _create_judf(self, name)
1567 pickled_command, broadcast_vars, env, includes =
_prepare_for_python_RDD(sc, command, self)
1568 ctx = SQLContext.getOrCreate(sc)
-> 1569 jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
1570 if name is None:
1571 name = f.__name__ if hasattr(f, '__name__') else
f.__class__.__name__
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/contex
t.py in _ssql_ctx(self)
681 try:
682 if not hasattr(self, '_scala_HiveContext'):
--> 683 self._scala_HiveContext = self._get_hive_ctx()
684 return self._scala_HiveContext
685 except Py4JError as e:
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/contex
t.py in _get_hive_ctx(self)
690
691 def _get_hive_ctx(self):
--> 692 return self._jvm.HiveContext(self._jsc.sc())
693
694 def refreshTable(self, tableName):
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.z
ip/py4j/java_gateway.py in __call__(self, *args)
1062 answer = self._gateway_client.send_command(command)
1063 return_value = get_return_value(
-> 1064 answer, self._gateway_client, None, self._fqn)
1065
1066 for temp_arg in temp_args:
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.
py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/p
y4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client,
target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling
None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException: java.lang.RuntimeException: Unable to
instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
at
org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:20
4)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAcces
sorImpl.java:62)
at
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstruc
torAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
at
org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedC
lientLoader.scala:249)
at
org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.sc
ala:327)
at
org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:237)
at org.apache.spark.sql.hive.HiveContext.setConf(HiveContext.scala:441)
at
org.apache.spark.sql.hive.HiveContext.defaultOverrides(HiveContext.scala:226
)
at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:229)
at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)