Hi Jeff,

Sorry I did not respond sooner; I was out of town.

Here is the code I use to initialize the HiveContext:
# load data set
from pyspark.sql import HiveContext #,SQLContext, Row
# window functions require HiveContext (spark 2.x will not require hive)
#sqlContext = SQLContext(sc)
hiveSqlContext = HiveContext(sc)
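A quick way to confirm the context is usable before going further (a minimal check; any cheap query would do):

# fails fast if the Hive metastore cannot be reached
hiveSqlContext.sql("show tables").show()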
Here is the complete stack trace. Could the problem be due to the size of
numDimensions?

numDimensions = 713912692155621377

The indices are sorted, so I am not sure why this exception is raised:
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 531, in __init__
    raise TypeError("indices array must be sorted")
TypeError: indices array must be sorted
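One thing I noticed while digging: pyspark 1.6 appears to store SparseVector indices as 32-bit ints (numpy int32). If that is right, ids larger than 2**31 - 1, like mine, would wrap around during the cast and could land out of order even though the Python list is sorted. A minimal sketch of the wraparound, in plain numpy (no Spark needed):

import numpy as np

# two 64-bit ids in ascending order
ids = np.array([2**31 - 1, 2**31], dtype=np.int64)

# casting to int32 wraps the second id to a negative number,
# so the array is no longer sorted
print(ids.astype(np.int32))  # [ 2147483647 -2147483648]

If that is what is happening, numDimensions and the follow ids being far above the int32 range would explain the TypeError. Here is the rest of my code: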
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import VectorUDT
# Example: sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
#   3          = size
#   [0, 2]     = int indices
#   [1.0, 3.0] = values
"""
root
|-- id: string (nullable = true)
|-- follows: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: long (nullable = false)
| | |-- screenName: string (nullable = false)
"""
def toSparseVector(pojoList):
    # collect the follow ids; they become the non-zero indices
    indices = []
    for pojo in pojoList:
        indices.append(pojo.id)

    # SparseVector requires the indices to be sorted
    sortedIndices = sorted(indices)
    logical = np.ones(len(sortedIndices))
    vec = Vectors.sparse(numDimensions, sortedIndices, logical)
    return vec
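For what it is worth, the function behaves as expected on a toy example run locally (a sketch; Pojo is a hypothetical stand-in for the follows struct elements, and numDimensions is overridden with a small value just for the test):

from collections import namedtuple

# hypothetical stand-in for the elements of the follows column
Pojo = namedtuple("Pojo", ["id", "screenName"])

numDimensions = 10  # small size just for this local test
print(toSparseVector([Pojo(5, "b"), Pojo(2, "a")]))  # (10,[2,5],[1.0,1.0])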
#myUDF = udf(lambda pojoList: labelStr if (labelStr == "noise") else "injury", StringType())
from pyspark.sql.functions import udf

newColName = "features"
myUDF = udf(toSparseVector, VectorUDT())
featuresDF = df.withColumn(newColName, myUDF(df["follows"]))
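If the int32 theory holds, the largest follow id should be well above 2**31 - 1. A quick check against the DataFrame (a sketch, assuming the df from above):

from pyspark.sql.functions import explode

# flatten the follows arrays and look at the largest id
maxId = (df.select(explode(df["follows"]).alias("f"))
           .selectExpr("max(f.id) as maxId")
           .first().maxId)
print(maxId, maxId > 2**31 - 1)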
In [16]:
featuresDF.printSchema()
featuresDF.show()
root
|-- id: string (nullable = true)
|-- follows: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: long (nullable = false)
| | |-- screenName: string (nullable = false)
|-- features: vector (nullable = true)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-16-6f7c439ddd93> in <module>()
1 featuresDF.printSchema()
----> 2 featuresDF.show()
/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/dataframe.py in show(self, n, truncate)
    255         +---+-----+
    256         """
--> 257         print(self._jdf.showString(n, truncate))
    258
    259     def __repr__(self):

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
    811         answer = self.gateway_client.send_command(command)
    812         return_value = get_return_value(
--> 813             answer, self.gateway_client, self.target_id, self.name)
    814
    815         for temp_arg in temp_args:

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py in deco(*a, **kw)
     43     def deco(*a, **kw):
     44         try:
---> 45             return f(*a, **kw)
     46         except py4j.protocol.Py4JJavaError as e:
     47             s = e.java_exception.toString()

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    306                 raise Py4JJavaError(
    307                     "An error occurred while calling {0}{1}{2}.\n".
--> 308                     format(target_id, ".", name), value)
    309             else:
    310                 raise Py4JError(
Py4JJavaError: An error occurred while calling o128.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 1 times, most recent failure: Lost task 0.0 in stage 16.0 (TID 219, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py", line 1563, in <lambda>
    func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
  File "<ipython-input-15-9076fa544242>", line 28, in toSparseVector
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 827, in sparse
    return SparseVector(size, *args)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 531, in __init__
    raise TypeError("indices array must be sorted")
TypeError: indices array must be sorted
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
    at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
    at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
    at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:405)
    at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:370)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
    at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:212)
    at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:165)
    at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
    at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
    at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
    at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
    at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1498)
    at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1505)
    at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1375)
    at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1374)
    at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
    at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1374)
    at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1456)
    at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:170)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
    at py4j.Gateway.invoke(Gateway.java:259)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:209)
    at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py", line 1563, in <lambda>
    func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
  File "<ipython-input-15-9076fa544242>", line 28, in toSparseVector
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 827, in sparse
    return SparseVector(size, *args)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 531, in __init__
    raise TypeError("indices array must be sorted")
TypeError: indices array must be sorted
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
    at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
    at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
    at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:405)
    at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:370)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    ... 1 more
From: Jeff Zhang <[email protected]>
Date: Tuesday, March 29, 2016 at 10:34 PM
To: Andrew Davidson <[email protected]>
Cc: "user @spark" <[email protected]>
Subject: Re: pyspark unable to convert dataframe column to a vector: Unable
to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
> According the stack trace, it seems the HiveContext is not initialized
> correctly. Do you have any more error message ?
>
> On Tue, Mar 29, 2016 at 9:29 AM, Andy Davidson <[email protected]>
> wrote:
>> I am using pyspark spark-1.6.1-bin-hadoop2.6 and python3. I have a data frame
>> with a column I need to convert to a sparse vector. I get an exception
>>
>> Any idea what my bug is?
>>
>> Kind regards
>>
>> Andy
>>
>>
>> Py4JJavaError: An error occurred while calling
>> None.org.apache.spark.sql.hive.HiveContext.
>> : java.lang.RuntimeException: java.lang.RuntimeException: Unable to
>> instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
>> at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>> at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204)
>>
>> Here is my python code fragment with a more complete stack trace
>>
>> # load data set
>> from pyspark.sql import HiveContext #,SQLContext, Row
>>
>> # window functions require HiveContext (spark 2.x will not require hive)
>> #sqlContext = SQLContext(sc)
>> hiveSqlContext = HiveContext(sc)
>>
>> …
>>
>> import numpy as np
>> from pyspark.mllib.linalg import Vectors
>> from pyspark.mllib.linalg import VectorUDT
>>
>> # Example: sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
>> #   3          = size
>> #   [0, 2]     = int indices
>> #   [1.0, 3.0] = values
>>
>>
>> """
>> root
>> |-- id: string (nullable = true)
>> |-- samples: array (nullable = true)
>> | |-- element: struct (containsNull = true)
>> | | |-- id: long (nullable = false)
>> | | |-- rateStr: string (nullable = false)
>>
>> """
>>
>> def toSparseVector(pojoList):
>>     indices = []
>>     for pojo in pojoList:
>>         indices.append(pojo.id)
>>
>>     l = np.ones(len(indices))
>>     v = Vectors.sparse(numDimensions, indices, l)
>>     return v
>>
>> myUDF = udf(toSparseVector, VectorUDT())
>> features = df.withColumn(newColName, myUDF(df["samples"]))
>>
>>
>> Py4JJavaError Traceback (most recent call last)
>> <ipython-input-77-30ab820130a0> in <module>()
>>      30 #myUDF = udf(lambda pojoList: labelStr if (labelStr == "noise") else "injury", StringType())
>>      31
>> ---> 32 myUDF = udf(toSparseVector, VectorUDT()) #
>>      33 features = df.withColumn(newColName, myUDF(df["follows"]))
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in udf(f, returnType)
>>    1595     [Row(slen=5), Row(slen=3)]
>>    1596     """
>> -> 1597     return UserDefinedFunction(f, returnType)
>>    1598
>>    1599 blacklist = ['map', 'since', 'ignore_unicode_prefix']
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
>>    1556         self.returnType = returnType
>>    1557         self._broadcast = None
>> -> 1558         self._judf = self._create_judf(name)
>>    1559
>>    1560     def _create_judf(self, name):
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in _create_judf(self, name)
>>    1567         pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
>>    1568         ctx = SQLContext.getOrCreate(sc)
>> -> 1569         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
>>    1570         if name is None:
>>    1571             name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py in _ssql_ctx(self)
>>     681         try:
>>     682             if not hasattr(self, '_scala_HiveContext'):
>> --> 683                 self._scala_HiveContext = self._get_hive_ctx()
>>     684             return self._scala_HiveContext
>>     685         except Py4JError as e:
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py in _get_hive_ctx(self)
>>     690
>>     691     def _get_hive_ctx(self):
>> --> 692         return self._jvm.HiveContext(self._jsc.sc())
>>     693
>>     694     def refreshTable(self, tableName):
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
>>    1062         answer = self._gateway_client.send_command(command)
>>    1063         return_value = get_return_value(
>> -> 1064             answer, self._gateway_client, None, self._fqn)
>>    1065
>>    1066         for temp_arg in temp_args:
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py in deco(*a, **kw)
>>     43     def deco(*a, **kw):
>>     44         try:
>> ---> 45             return f(*a, **kw)
>>     46         except py4j.protocol.Py4JJavaError as e:
>>     47             s = e.java_exception.toString()
>>
>> /Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
>>     306                 raise Py4JJavaError(
>>     307                     "An error occurred while calling {0}{1}{2}.\n".
>> --> 308                     format(target_id, ".", name), value)
>>     309             else:
>>     310                 raise Py4JError(
>>
>> Py4JJavaError: An error occurred while calling
>> None.org.apache.spark.sql.hive.HiveContext.
>> : java.lang.RuntimeException: java.lang.RuntimeException: Unable to
>> instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
>> at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>> at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204)
>> at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>> at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>> at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>> at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
>> at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:249)
>> at org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:327)
>> at org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:237)
>> at org.apache.spark.sql.hive.HiveContext.setConf(HiveContext.scala:441)
>> at org.apache.spark.sql.hive.HiveContext.defaultOverrides(HiveContext.scala:226)
>> at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:229)
>> at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)
>>
>
>
>
> --
> Best Regards
>
> Jeff Zhang