My code works fine with the JSON input format (Spark 1.6 on Amazon EMR,
emr-5.0.0). I then tried the Parquet format, which works fine for English
data. However, when I try the Parquet format with some Japanese-language
text, I get this weird stack trace:
Caused by: java.lang.ClassCastException: optional binary element (UTF8) is
not a group
at org.apache.parquet.schema.Type.asGroupType(Type.java:202)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
 
at scala.Option.map(Option.scala:146)  at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
 
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
 
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
 
at scala.collection.Iterator$class.foreach(Iterator.scala:893)  at
scala.collection.AbstractIterator.foreach(Iterator.scala:1336)  at
scala.collection.IterableLike$class.foreach(IterableLike.scala:72)  at
org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)  at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)  at
org.apache.spark.sql.types.StructType.map(StructType.scala:95)  at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
 
at scala.Option.map(Option.scala:146)  at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
 
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
 
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
 
at scala.collection.Iterator$class.foreach(Iterator.scala:893)  at
scala.collection.AbstractIterator.foreach(Iterator.scala:1336)  at
scala.collection.IterableLike$class.foreach(IterableLike.scala:72)  at
org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)  at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)  at
org.apache.spark.sql.types.StructType.map(StructType.scala:95)  at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67)
 
at
org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168)
 
at
org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192)
 
at
org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377)
 
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
 
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
 
at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
 
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown
Source)  at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
 
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
 
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)  at
scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:211)  at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)  at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown
Source)  at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown
Source)  at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
 
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
 
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)  at
org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
 
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) 
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) 
at org.apache.spark.scheduler.Task.run(Task.scala:85)  at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)  ... 3
more



--
View this message in context: 
http://apache-spark-user-list.1001560.n3.nabble.com/java-lang-ClassCastException-optional-binary-element-UTF8-is-not-a-group-tp27765.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

Reply via email to