cloud-fan commented on a change in pull request #24372: [SPARK-27462][SQL]
Enhance insert into hive table that could choose some columns in target table
flexibly.
URL: https://github.com/apache/spark/pull/24372#discussion_r281964750
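
For context, the change under review lets an INSERT name only a subset of the target Hive table's columns; the rule quoted below then pads the query with NULL expressions for the columns that were not named. A minimal usage sketch, assuming the PR accepts a standard column list after the table name (the table `t` and its schema are hypothetical, invented for illustration only):

```scala
// Hypothetical target Hive table with three columns.
spark.sql("CREATE TABLE t (a INT, b STRING, c DOUBLE) STORED AS PARQUET")

// Name only columns `a` and `c`; column `b` is not listed, so the
// PreprocessTableInsertion rule below would fill it with a NULL expression.
spark.sql("INSERT INTO TABLE t (a, c) SELECT 1, 2.0")
```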
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
##########
@@ -332,24 +334,58 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] {
   private def preprocess(
       insert: InsertIntoTable,
       tblName: String,
+      insertedCols: Option[Seq[String]],
       partColNames: Seq[String]): InsertIntoTable = {
     val normalizedPartSpec = PartitioningUtils.normalizePartitionSpec(
       insert.partition, partColNames, tblName, conf.resolver)
     val staticPartCols = normalizedPartSpec.filter(_._2.isDefined).keySet
+    val selectedCols = if (insertedCols == None) {
+      insert.table.output
+    } else {
+      val tableCols = insert.table.output.map(_.name)
+      val noexistsCols = insertedCols.get.filterNot(col => tableCols.contains(col))
+      if (noexistsCols.size > 0) {
+        throw new AnalysisException(s"Table $tblName does not exists these columns: $noexistsCols.")
+      }
+      insert.table.output.filter(a => insertedCols.get.contains(a.name))
+    }
     val expectedColumns = insert.table.output.filterNot(a => staticPartCols.contains(a.name))
     if (expectedColumns.length != insert.query.schema.length) {
       throw new AnalysisException(
-        s"$tblName requires that the data to be inserted have the same number of columns as the " +
-          s"target table: target table has ${insert.table.output.size} column(s) but the " +
-          s"inserted data has ${insert.query.output.length + staticPartCols.size} column(s), " +
+        s"$tblName requires that the data to be inserted have the same number of columns as " +
+          s"the number of columns selected in the target table: the number of columns selected " +
+          s"has ${expectedColumns.length + staticPartCols.size} column(s) but the inserted data " +
+          s"has ${insert.query.output.length + staticPartCols.size} column(s), " +
           s"including ${staticPartCols.size} partition column(s) having constant value(s).")
     }
+    val tableColumns = insert.table.output.filterNot(a => staticPartCols.contains(a.name))
+    val tableCols = tableColumns.map(_.name)
+    val filledQuery = if (insertedCols == None) {
+      insert.query
+    } else {
+      // Because `HiveFileFormat` writes data according to the index of columns which belongs
+      // target table, in order to align the data, we need to fill in some empty expressions.
+      val cols = insertedCols.get
+      val project = insert.query.asInstanceOf[Project]
+      val filledProjectList = ArrayBuffer.empty[NamedExpression]
+      var i = 0
+      tableCols.foreach { tableCol =>
+        if (cols.contains(tableCol)) {
+          filledProjectList += project.projectList(i)
+          i += 1
+        } else {
+          filledProjectList += Alias(Literal(null, NullType), "NULL")()
Review comment:
what if the column is not nullable in the target table?
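
One way the fill expression could account for that, sketched below purely as an illustration rather than the PR's actual fix: derive the filler from the target attribute so the NULL literal carries the column's real data type, and fail analysis when the column is not nullable. `nullFillFor` is a hypothetical helper that would sit inside `PreprocessTableInsertion`, and it assumes the loop walks the attributes in `tableColumns` instead of just their names so the target `Attribute` is in scope.

```scala
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Literal, NamedExpression}

// Sketch only: build the fill expression from the target column itself.
// Reject non-nullable columns up front instead of silently writing NULL into them,
// and give the literal the column's data type rather than NullType.
def nullFillFor(attr: Attribute, tblName: String): NamedExpression = {
  if (!attr.nullable) {
    throw new AnalysisException(
      s"Cannot fill non-nullable column `${attr.name}` of table $tblName with NULL.")
  }
  Alias(Literal(null, attr.dataType), attr.name)()
}
```

With a helper like that, the `else` branch above would become `filledProjectList += nullFillFor(attr, tblName)`.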