In Spark 1.2 you'll have to create a partitioned Hive table
<https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-AddPartitions>
in order to read Parquet data laid out this way.  In Spark 1.3 the Parquet
data source will auto-discover partitions when they follow this directory
layout.
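
For 1.2, here's a rough sketch of what that can look like from PySpark (this
assumes a Hive-enabled build of Spark; the table name reuses your path, but
the column list is only a placeholder, so substitute your real schema):

from pyspark.sql import HiveContext

hc = HiveContext(sc)

# Point a partitioned external table at the existing directory layout.
# "key" becomes a partition column rather than a column stored inside the
# parquet files themselves.  The columns listed here are placeholders.
hc.sql("""
  CREATE EXTERNAL TABLE IF NOT EXISTS impala_new_4 (event_id STRING, cnt BIGINT)
  PARTITIONED BY (key STRING)
  STORED AS PARQUET
  LOCATION '/user/hive/warehouse/impala_new_4'
""")

# Register the directory for each partition value you want to query.
hc.sql("ALTER TABLE impala_new_4 ADD IF NOT EXISTS PARTITION (key='20141001')")

# The partition column is now resolvable in SQL.
for row in hc.sql("SELECT * FROM impala_new_4 WHERE key = '20141001'").collect():
    print row

In 1.3 the idea is that you'll be able to point the reader at the table root
instead of listing individual files, and filter on the discovered partition
column directly:

parquetFile = sqlContext.parquetFile("/user/hive/warehouse/impala_new_4")
parquetFile.registerTempTable("parquetFileone")
results = sqlContext.sql("SELECT * FROM parquetFileone WHERE key = 20141001")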

Michael

On Mon, Jan 5, 2015 at 1:01 PM, Sam Flint <sam.fl...@magnetic.com> wrote:

> Below is the code that I am running.  I get an error about unresolved
> attributes.  Can anyone point me in the right direction?  I'm running from
> the pyspark shell on YARN ("MASTER=yarn-client pyspark").
>
> Error is below code:
>
>
> # Import SQLContext and data types
> from pyspark.sql import *
>
> # sc is an existing SparkContext.
> sqlContext = SQLContext(sc)
>
> # The result of loading a parquet file is also a SchemaRDD.
> # Try loading all data that you have
> parquetFile =
> sqlContext.parquetFile("/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.0.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.1.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.10.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.11.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.2.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.3.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.4.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.5.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.6.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.7.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.8.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.9.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.0.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.1.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.2.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.3.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.4.parq")
>
>
>
> # Parquet files can also be registered as tables and then used in SQL statements.
> parquetFile.registerTempTable("parquetFileone")
>
>
> results = sqlContext.sql("SELECT * FROM parquetFileone where key=20141001")
>
> #print results
> for result in results.collect():
>   print result
>
>
>
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File
> "/opt/cloudera/parcels/CDH-5.2.1-1.cdh5.2.1.p0.12/lib/spark/python/pyspark/sql.py",
> line 1615, in collect
>     rows = RDD.collect(self)
>   File
> "/opt/cloudera/parcels/CDH-5.2.1-1.cdh5.2.1.p0.12/lib/spark/python/pyspark/rdd.py",
> line 678, in collect
>     bytesInJava = self._jrdd.collect().iterator()
>   File
> "/opt/cloudera/parcels/CDH-5.2.1-1.cdh5.2.1.p0.12/lib/spark/python/pyspark/sql.py",
> line 1527, in _jrdd
>     self._lazy_jrdd = self._jschema_rdd.javaToPython()
>   File
> "/opt/cloudera/parcels/CDH-5.2.1-1.cdh5.2.1.p0.12/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py",
> line 538, in __call__
>   File
> "/opt/cloudera/parcels/CDH-5.2.1-1.cdh5.2.1.p0.12/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py",
> line 300, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling
> o29.javaToPython.
> : org.apache.spark.sql.catalyst.errors.package$TreeNodeException:
> Unresolved attributes: *, tree:
> Project [*]
>  Filter ('key = 20141001)
>   Subquery parquetFileone
>    ParquetRelation
> /user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.0.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.1.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.10.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.11.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.2.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.3.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.4.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.5.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.6.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.7.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.8.parq,/user/hive/warehouse/impala_new_4/key=20141001/69446344000a3a17-c90aac1f33a0fbc_875501925_data.9.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.0.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.1.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.2.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.3.parq,/user/hive/warehouse/impala_new_4/key=20141001/f1448ca083a5e224-159572f61b50d7a3_854675293_data.4.parq,
> Some(Configuration: core-default.xml, core-site.xml, yarn-default.xml,
> yarn-site.xml, mapred-default.xml, mapred-site.xml, hdfs-default.xml,
> hdfs-site.xml), org.apache.spark.sql.SQLContext@2c76fd82, []
>
> at
> org.apache.spark.sql.catalyst.analysis.Analyzer$CheckResolution$$anonfun$apply$1.applyOrElse(Analyzer.scala:72)
> at
> org.apache.spark.sql.catalyst.analysis.Analyzer$CheckResolution$$anonfun$apply$1.applyOrElse(Analyzer.scala:70)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:165)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:183)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.transformChildrenDown(TreeNode.scala:212)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:168)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:156)
> at
> org.apache.spark.sql.catalyst.analysis.Analyzer$CheckResolution$.apply(Analyzer.scala:70)
> at
> org.apache.spark.sql.catalyst.analysis.Analyzer$CheckResolution$.apply(Analyzer.scala:68)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:61)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:59)
> at
> scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
> at
> scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:60)
> at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:34)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:59)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:51)
> at scala.collection.immutable.List.foreach(List.scala:318)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor.apply(RuleExecutor.scala:51)
> at
> org.apache.spark.sql.SQLContext$QueryExecution.analyzed$lzycompute(SQLContext.scala:397)
> at
> org.apache.spark.sql.SQLContext$QueryExecution.analyzed(SQLContext.scala:397)
> at org.apache.spark.sql.SchemaRDD.javaToPython(SchemaRDD.scala:412)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
> at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
> at py4j.Gateway.invoke(Gateway.java:259)
> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
> at py4j.commands.CallCommand.execute(CallCommand.java:79)
> at py4j.GatewayConnection.run(GatewayConnection.java:207)
> at java.lang.Thread.run(Thread.java:745)
>
> --
>
> MAGNE+IC
>
> Sam Flint | Lead Developer, Data Analytics
>
>
>
