java.lang.ClassCastException: optional binary element (UTF8) is not a group

2016-09-20 Thread naveen.ra...@sony.com
My code works fine with the JSON input format (Spark 1.6 on Amazon EMR,
emr-5.0.0). I then tried the Parquet format. It works fine for English data, but
when I try Parquet with some Japanese-language text, I get this weird
stack trace:
Caused by: java.lang.ClassCastException: optional binary element (UTF8) is not a group
  at org.apache.parquet.schema.Type.asGroupType(Type.java:202)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at scala.Option.map(Option.scala:146)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.Iterator$class.foreach(Iterator.scala:893)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
  at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
  at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at scala.Option.map(Option.scala:146)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.Iterator$class.foreach(Iterator.scala:893)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
  at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
  at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67)
  at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168)
  at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192)
  at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
  at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
  at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)

java.lang.ClassCastException: optional binary element (UTF8) is not a group

2016-09-20 Thread Rajan, Naveen
Dear All,
My code works fine with JSON input data. When I tried the Parquet data
format, it worked for English data, but for Japanese text I am getting the
stack trace below. Please help!

Caused by: java.lang.ClassCastException: optional binary element (UTF8) is not 
a group
  at org.apache.parquet.schema.Type.asGroupType(Type.java:202)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at scala.Option.map(Option.scala:146)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
  at 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.Iterator$class.foreach(Iterator.scala:893)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
  at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
  at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
  at scala.Option.map(Option.scala:146)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
  at 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.Iterator$class.foreach(Iterator.scala:893)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
  at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
  at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67)
  at 
org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168)
  at 
org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192)
  at 
org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377)
  at 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
  at 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
  at 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
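
This exception typically comes from a mismatch between the schema Spark is asked
to read with and the Parquet file's actual layout: the requested schema declares a
list element as a group (a struct), while the file stores that element as a plain
string, which is exactly the "optional binary element (UTF8)" in the message. The
following is only a minimal sketch that reproduces the same trace under that
assumption; the path and column names are hypothetical and not taken from the thread.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

object ParquetElementMismatch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("parquet-element-mismatch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Write a Parquet file whose "tags" column is an array of strings
    // (stored by Parquet as "optional binary element (UTF8)").
    Seq((1, Seq("ニュース", "スポーツ")))
      .toDF("id", "tags")
      .write.mode("overwrite").parquet("/tmp/tags_parquet") // hypothetical path

    // Read the same file back with a schema that declares the list element as a
    // struct. ParquetReadSupport then tries to clip the primitive element as a
    // group and the scan fails with:
    //   java.lang.ClassCastException: optional binary element (UTF8) is not a group
    val mismatched = new StructType()
      .add("id", IntegerType)
      .add("tags", ArrayType(new StructType().add("value", StringType)))

    spark.read.schema(mismatched).parquet("/tmp/tags_parquet").show()

    spark.stop()
  }
}

If that is what is happening here, the Japanese text itself is incidental; it simply
appears in the files whose schema (for example, one inferred from earlier JSON data)
differs from the schema used for the read. Reading with a schema that matches the
file's real layout, e.g. ArrayType(StringType) for the element, avoids the cast error.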