Dear all,

My code works fine with JSON input data. When I switched to the Parquet format, English data still worked, but for Japanese text I get the stack trace below. Please help!
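For context, here is a minimal, self-contained sketch that reproduces the same exception class on Spark 2.0.x (which the line numbers in the trace appear to match). The paths, field names, and schema below are placeholders I made up for illustration, not my actual job; the point is only that this exception shows up when the schema Spark is asked to read expects a group (array/struct) where the Parquet file actually stores a plain UTF8 string:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types._

    // Sketch only: triggers "optional binary element (UTF8) is not a group".
    // All names and paths are placeholders, not from my actual job.
    object ParquetSchemaMismatchSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("parquet-schema-mismatch-sketch")
          .master("local[*]")
          .getOrCreate()
        import spark.implicits._

        val path = "/tmp/mismatch-demo" // placeholder path

        // The file on disk stores "tags" as array<string>.
        Seq(("a", Seq("x", "y"))).toDF("id", "tags")
          .write.mode("overwrite").parquet(path)

        // The read, however, requests "tags" as array<struct<value:string>>,
        // so ParquetReadSupport tries to clip the string list element as a
        // group, and asGroupType throws the ClassCastException.
        val requested = StructType(Seq(
          StructField("id", StringType),
          StructField("tags", ArrayType(StructType(Seq(
            StructField("value", StringType)))))))

        spark.read.schema(requested).parquet(path).show() // fails here

        spark.stop()
      }
    }

In other words, the trace seems to say that the schema requested for the read treats some nested field as a list of groups, while the Japanese Parquet files store that field's list elements as plain UTF8 strings.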
Caused by: java.lang.ClassCastException: optional binary element (UTF8) is not a group
    at org.apache.parquet.schema.Type.asGroupType(Type.java:202)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
    at scala.Option.map(Option.scala:146)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
    at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272)
    at scala.Option.map(Option.scala:146)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
    at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67)
    at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168)
    at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192)
    at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
    at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:211)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
    at org.apache.spark.scheduler.Task.run(Task.scala:85)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    ... 3 more

Regards,
Naveen