[
https://issues.apache.org/jira/browse/SPARK-38038?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
ShiHang Gao updated SPARK-38038:
--------------------------------
Description:
I want to use the DataSource V2 OrcTable to read a Hive partitioned table. The
directory layout of the partitioned table is as follows (note that the partition
directories are not named in the Hive key=value style):
{code:java}
hadoop fs -ls /user/hive/warehouse/f_audience_cpx_daily
Found 2 items
/user/hive/warehouse/f_audience_cpx_daily/20220103
/user/hive/warehouse/f_audience_cpx_daily/20220104 {code}
When I read the table, I get this error:
{code:java}
Exception in thread "main" org.apache.spark.sql.AnalysisException: Unable to infer schema for ORC. It must be specified manually.;
    at org.apache.spark.sql.execution.datasources.v2.FileTable.$anonfun$dataSchema$5(FileTable.scala:71)
    at scala.Option.getOrElse(Option.scala:189)
    at org.apache.spark.sql.execution.datasources.v2.FileTable.dataSchema$lzycompute(FileTable.scala:71)
    at org.apache.spark.sql.execution.datasources.v2.FileTable.dataSchema(FileTable.scala:63)
    at org.apache.spark.sql.execution.datasources.v2.FileTable.schema$lzycompute(FileTable.scala:82)
    at org.apache.spark.sql.execution.datasources.v2.FileTable.schema(FileTable.scala:80)
    at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation$.create(DataSourceV2Relation.scala:150)
    at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation$.create(DataSourceV2Relation.scala:158)
    at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveTables$$lookupV2Relation(Analyzer.scala:924)
    at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables$$anonfun$apply$8.applyOrElse(Analyzer.scala:886)
    at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables$$anonfun$apply$8.applyOrElse(Analyzer.scala:884)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUp$3(AnalysisHelper.scala:90)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:72)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUp$1(AnalysisHelper.scala:90)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:194)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp(AnalysisHelper.scala:86)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp$(AnalysisHelper.scala:84)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:29)
    at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables$.apply(Analyzer.scala:884)
    at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables$.apply(Analyzer.scala:883)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:149)
    at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
    at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
    at scala.collection.immutable.List.foldLeft(List.scala:89)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:146)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:138)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:138)
    at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:176)
    at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:170)
    at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:130)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:116)
    at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:116)
    at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:154)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201)
    at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:153)
    at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:68)
    at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
    at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:133)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
    at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:133)
    at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:68)
    at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:66)
    at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:58)
    at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:91)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:89)
    at org.apache.spark.sql.SparkSession.table(SparkSession.scala:585)
    at org.apache.spark.sql.SparkSession.table(SparkSession.scala:581) {code}
> DataSourceV2 OrcTable can't read partition directories that don't contain "="
> -----------------------------------------------------------------------------
>
> Key: SPARK-38038
> URL: https://issues.apache.org/jira/browse/SPARK-38038
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 3.0.1
> Reporter: ShiHang Gao
> Priority: Critical
> Labels: bulk-closed
>
--
This message was sent by Atlassian Jira
(v8.20.1#820001)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]