[ https://issues.apache.org/jira/browse/PARQUET-1036?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16056358#comment-16056358 ]
Ashima Sood commented on PARQUET-1036:
--------------------------------------
17/06/20 13:07:50 INFO FileSourceStrategy: Pruning directories with:
17/06/20 13:07:50 INFO FileSourceStrategy: Post-Scan Filters:
17/06/20 13:07:50 INFO FileSourceStrategy: Output Data Schema: struct<DATE: binary, ROW_ID: binary, STATUS: binary, GEN_TIME: binary, GEN_MONTH: binary ... 109 more fields>
17/06/20 13:07:50 INFO FileSourceStrategy: Pushed Filters:
17/06/20 13:07:50 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
17/06/20 13:07:50 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
17/06/20 13:07:50 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
17/06/20 13:07:50 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
17/06/20 13:07:50 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
17/06/20 13:07:50 INFO SQLHadoopMapReduceCommitProtocol: Using output committer class org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
17/06/20 13:07:50 ERROR FileFormatWriter: Aborting job null.
org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange RoundRobinPartitioning(1)
+- FileScan parquet [DATE#0,ROW_ID#1,STATUS#2,GEN_TIME#3,GEN_MONTH#4.....,... 109 more fields] Batched: false, Format: Parquet, Location: InMemoryFileIndex[file:/<<<<filePath>>>>>..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DATE:binary,ROW_ID:binary,STATUS:binary,GEN_TIME:binary,GEN_M...
        at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
        at org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:112)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
        at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
        at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
        at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:127)
        at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
        at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
        at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:121)
        at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:101)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
        at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
        at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
        at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:492)
        at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:215)
        at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:198)
        at org.apache.spark.sql.DataFrameWriter.json(DataFrameWriter.scala:473)
        at com.fmrco.mdm.utils.SparkUtil.saveJSONFile(SparkUtil.java:145)
        at com.fmrco.mdm.utils.SparkUtil.SaveToFileFormat(SparkUtil.java:632)
        at com.fmrco.mdm.utils.SparkUtil.loadFileSrcData(SparkUtil.java:486)
        at com.fmrco.mdm.spark.framework.Framework.processJob(Framework.java:31)
        at com.fmrco.mdm.app.MDMApp.main(MDMApp.java:88)
Caused by: org.apache.spark.sql.AnalysisException: Attribute name "Unnamed: 110" contains invalid character(s) among " ,;{}()\n\t=". Please use alias to rename it.;
        at org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter$.checkConversionRequirement(ParquetSchemaConverter.scala:581)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter$.checkFieldName(ParquetSchemaConverter.scala:567)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter$$anonfun$checkFieldNames$1.apply(ParquetSchemaConverter.scala:575)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter$$anonfun$checkFieldNames$1.apply(ParquetSchemaConverter.scala:575)
        at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
        at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter$.checkFieldNames(ParquetSchemaConverter.scala:575)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.buildReader(ParquetFileFormat.scala:306)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.buildReaderWithPartitionValues(ParquetFileFormat.scala:291)
        at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:253)
        at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:251)
        at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:286)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
        at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:85)
        at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:121)
        at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:112)
        at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
        ... 35 more
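
The failing attribute "Unnamed: 110" looks like the placeholder pandas generates for a header-less column (often a trailing comma in the source CSV); Spark rejects it because the name contains a space. Renaming inside Spark does not help here, since the check fires in ParquetFileFormat.buildReader before the scan can run, so the file has to be fixed where it was written. A minimal workaround sketch, assuming the file came from pandas/pyarrow; the file names and the underscore rule are illustrative, not from this report:

{code:python}
# Hedged workaround sketch: rewrite the file with sanitized column
# names. Assumes pandas with Parquet support; paths are hypothetical.
import re
import pandas as pd

df = pd.read_parquet("source.parquet")  # or re-read the original CSV

# Replace every character Spark rejects (" ,;{}()\n\t=") with "_"
# so the rewritten file passes ParquetSchemaConverter.checkFieldNames.
df.columns = [re.sub(r"[ ,;{}()\n\t=]", "_", str(c)) for c in df.columns]

df.to_parquet("sanitized.parquet", index=False)
{code}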
> parquet file created via pyarrow 0.4.0 ; version 1.0 - incompatible with Spark
> ------------------------------------------------------------------------------
>
> Key: PARQUET-1036
> URL: https://issues.apache.org/jira/browse/PARQUET-1036
> Project: Parquet
> Issue Type: Bug
> Reporter: Ashima Sood
> Priority: Blocker
>
> Using Spark SQL, I am unable to read the parquet file; it shows null values, whereas Hive reads the values fine.
> 17/06/19 17:50:36 WARN CorruptStatistics: Ignoring statistics because created_by could not be parsed (see PARQUET-251): parquet-cpp version 1.0.0
> org.apache.parquet.VersionParser$VersionParseException: Could not parse created_by: parquet-cpp version 1.0.0 using format: (.+) version ((.*) )?\(build ?(.*)\)
>         at org.apache.parquet.VersionParser.parse(VersionParser.java:112)
>         at org.apache.parquet.CorruptStatistics.shouldIgnoreStatistics(CorruptStatistics.java:60)
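
For anyone trying to reproduce the null-values symptom from the original report, a hedged sketch of the writer side; the report names pyarrow 0.4.0 / parquet-cpp 1.0.0, and the column names and values below are made up:

{code:python}
# Repro sketch: write a Parquet file with pyarrow, then read it back
# with Spark SQL. Per the report, Spark shows nulls while Hive reads
# the same values fine. File name and data are illustrative.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(
    pd.DataFrame({"DATE": ["2017-06-19"], "STATUS": ["ok"]})
)
pq.write_table(table, "pyarrow_sample.parquet")

# Then, in a Spark shell:
# spark.read.parquet("pyarrow_sample.parquet").show()
{code}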