[ https://issues.apache.org/jira/browse/SPARK-12110?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15036960#comment-15036960 ]

Patrick Wendell commented on SPARK-12110:
-----------------------------------------

Hey Andrew, could you show the exact command you are running for this
example? Also, if you simply download Spark 1.5.1 and run the same command
locally, rather than on your modified EC2 cluster, does it work?
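
For reference, here is the kind of minimal local check I have in mind (a sketch only, not a definitive recipe; run it inside bin/pyspark from the unmodified 1.5.1 download, where sc is the SparkContext the shell provides):

{code}
# Reproduce just the failing step: constructing a HiveContext.
from pyspark.sql import HiveContext

hc = HiveContext(sc)  # the constructor that fails on your cluster

# If construction succeeds, the tokenizer example's createDataFrame call
# should work too, since it goes through the same _ssql_ctx path.
hc.createDataFrame([(0, "Hi I heard about Spark")], ["label", "sentence"]).show()
{code}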

> spark-1.5.1-bin-hadoop2.6;  pyspark.ml.feature  Exception: ("You must build Spark with Hive
> --------------------------------------------------------------------------------------------
>
>                 Key: SPARK-12110
>                 URL: https://issues.apache.org/jira/browse/SPARK-12110
>             Project: Spark
>          Issue Type: Bug
>          Components: EC2
>    Affects Versions: 1.5.1
>         Environment: cluster created using spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2
>            Reporter: Andrew Davidson
>
> I am using spark-1.5.1-bin-hadoop2.6. I used
> spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2 to create a cluster and configured
> spark-env to use python3. I cannot run the tokenizer sample code. Is there a
> workaround? (A possible sketch follows the traceback below.)
> Kind regards
> Andy
> {code}
> http://spark.apache.org/docs/latest/ml-features.html#tokenizer
> from pyspark.ml.feature import Tokenizer, RegexTokenizer
> sentenceDataFrame = sqlContext.createDataFrame([
>   (0, "Hi I heard about Spark"),
>   (1, "I wish Java could use case classes"),
>   (2, "Logistic,regression,models,are,neat")
> ], ["label", "sentence"])
> tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
> wordsDataFrame = tokenizer.transform(sentenceDataFrame)
> for words_label in wordsDataFrame.select("words", "label").take(3):
>   print(words_label)
> ---------------------------------------------------------------------------
> Py4JJavaError                             Traceback (most recent call last)
> /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
>     654             if not hasattr(self, '_scala_HiveContext'):
> --> 655                 self._scala_HiveContext = self._get_hive_ctx()
>     656             return self._scala_HiveContext
> /root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
>     662     def _get_hive_ctx(self):
> --> 663         return self._jvm.HiveContext(self._jsc.sc())
>     664 
> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
>     700         return_value = get_return_value(answer, self._gateway_client, None,
> --> 701                 self._fqn)
>     702 
> /root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
>      35         try:
> ---> 36             return f(*a, **kw)
>      37         except py4j.protocol.Py4JJavaError as e:
> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
>     299                     'An error occurred while calling {0}{1}{2}.\n'.
> --> 300                     format(target_id, '.', name), value)
>     301             else:
> Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.
> : java.lang.RuntimeException: java.io.IOException: Filesystem closed
>       at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>       at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:171)
>       at org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:162)
>       at org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:160)
>       at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:167)
>       at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>       at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>       at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>       at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
>       at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
>       at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
>       at py4j.Gateway.invoke(Gateway.java:214)
>       at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
>       at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
>       at py4j.GatewayConnection.run(GatewayConnection.java:207)
>       at java.lang.Thread.run(Thread.java:745)
> Caused by: java.io.IOException: Filesystem closed
>       at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:323)
>       at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1057)
>       at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:554)
>       at org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:599)
>       at org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
>       at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
>       ... 15 more
> During handling of the above exception, another exception occurred:
> Exception                                 Traceback (most recent call last)
> <ipython-input-1-0beb490d573c> in <module>()
>       5   (1, "I wish Java could use case classes"),
>       6   (2, "Logistic,regression,models,are,neat")
> ----> 7 ], ["label", "sentence"])
>       8 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
>       9 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
> /root/spark/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio)
>     406             rdd, schema = self._createFromLocal(data, schema)
>     407         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
> --> 408         jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
>     409         df = DataFrame(jdf, self)
>     410         df._schema = schema
> /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
>     658             raise Exception("You must build Spark with Hive. "
>     659                             "Export 'SPARK_HIVE=true' and run "
> --> 660                             "build/sbt assembly", e)
>     661 
>     662     def _get_hive_ctx(self):
> Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run build/sbt assembly", Py4JJavaError('An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))
> {code}
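> One workaround sketch I am considering (assuming the tokenizer example does not actually need Hive support): build a plain SQLContext instead of letting the shell hand me a HiveContext, then run the example against that. Untested on the cluster:
> {code}
> # Hypothetical workaround: sidestep HiveContext entirely.
> from pyspark.sql import SQLContext
> sqlContext = SQLContext(sc)  # sc is the SparkContext the pyspark shell provides
> {code}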



