My code works fine with JSON input format (Spark 1.6 on Amazon EMR, emr-5.0.0). I tried the Parquet format. Works fine for English data. When I tried the Parquet format with some Japanese language text, I am getting this weird stack-trace: *Caused by: java.lang.ClassCastException: optional binary element (UTF8) is not a group* /at org.apache.parquet.schema.Type.asGroupType(Type.java:202) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at org.apache.spark.sql.types.StructType.map(StructType.scala:95) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at org.apache.spark.sql.types.StructType.map(StructType.scala:95) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67) at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168) at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192) at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439) at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:211) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) at org.apache.spark.scheduler.Task.run(Task.scala:85) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) ... 3 more/
-- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/java-lang-ClassCastException-optional-binary-element-UTF8-is-not-a-group-tp27765.html Sent from the Apache Spark User List mailing list archive at Nabble.com.