aokolnychyi commented on a change in pull request #1473: URL: https://github.com/apache/iceberg/pull/1473#discussion_r494426541
########## File path: spark3-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIcebergProcedures.scala ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Call, CallArgument, LogicalPlan} +import org.apache.spark.sql.catalyst.procedures.{IcebergProcedureRegistry, OptionalParameter, Parameter, Procedure} +import org.apache.spark.sql.catalyst.rules.Rule +import scala.collection.mutable + +// TODO: case sensitivity? 
+object ResolveIcebergProcedures extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case Call(nameParts, args) => + val procedure = IcebergProcedureRegistry.resolve(nameParts) + val values = buildProcedureValues(procedure, args) + procedure.createRunnableCommand(values) + } + + private def buildProcedureValues(procedure: Procedure, args: Seq[CallArgument]): Array[Any] = { + // build a map of declared parameter names to their positions + val params = procedure.parameters + val nameToPositionMap = params.map(_.name).zipWithIndex.toMap + + // verify named and positional args are not mixed + val containsNamedArg = args.exists(arg => arg.name.isDefined) + val containsPositionalArg = args.exists(arg => arg.name.isEmpty) + if (containsNamedArg && containsPositionalArg) { + throw new AnalysisException("Named and positional arguments cannot be mixed") + } + + // build a map of parameter names to args + val nameToArgMap = if (containsNamedArg) { + buildNameToArgMapUsingNames(args, nameToPositionMap) + } else { + buildNameToArgMapUsingPositions(args, params) + } + + // verify all required parameters are provided + params.filter(_.required) + .find(param => !nameToArgMap.contains(param.name)) + .foreach { missingArg => + throw new AnalysisException(s"Required procedure argument '${missingArg.name}' is missing") + } + + val values = new Array[Any](params.size) + + // convert provided args from internal Spark representation to Scala + nameToArgMap.foreach { case (name, arg) => + val position = nameToPositionMap(name) + val param = params(position) + val paramType = param.dataType + val argType = arg.expr.dataType + if (paramType != argType) { + throw new AnalysisException(s"Wrong arg type for '${param.name}': expected $paramType but got $argType") + } + values(position) = toScalaValue(arg.expr) + } + + // assign default values for optional params + params.foreach { + case p: OptionalParameter if 
!nameToArgMap.contains(p.name) => + val position = nameToPositionMap(p.name) + values(position) = p.defaultValue + case _ => + } + + values + } + + private def buildNameToArgMapUsingNames( + args: Seq[CallArgument], + nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = { + + val nameToArgMap = mutable.LinkedHashMap.empty[String, CallArgument] + args.foreach { arg => Review comment: @RussellSpitzer we could have this: ``` val namedArgs = args.asInstanceOf[Seq[NamedArgument]] namedArgs.groupBy(_.name) .foreach { case (name, matchingArgs) => if (matchingArgs.size > 1) { throw new AnalysisException(s"Duplicate procedure argument: '$name'") } if (!nameToPositionMap.contains(name)) { throw new AnalysisException(s"Unknown argument name: '$name'") } } namedArgs.map(arg => arg.name -> arg).toMap ``` But `groupBy(_.name)` will use a mutable map under the hood. Plus, the main motivation of using `LinkedHashMap` was to preserve the order. Later, we iterate through this map and do some validation and it would probably make sense to validate the args in the order they are defined (probably not a big deal?). Writing a recursive function and building an immutable set would probably make things more complicated than it should be. I am flexible here. ########## File path: spark3-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIcebergProcedures.scala ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Call, CallArgument, LogicalPlan} +import org.apache.spark.sql.catalyst.procedures.{IcebergProcedureRegistry, OptionalParameter, Parameter, Procedure} +import org.apache.spark.sql.catalyst.rules.Rule +import scala.collection.mutable + +// TODO: case sensitivity? +object ResolveIcebergProcedures extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case Call(nameParts, args) => + val procedure = IcebergProcedureRegistry.resolve(nameParts) + val values = buildProcedureValues(procedure, args) + procedure.createRunnableCommand(values) + } + + private def buildProcedureValues(procedure: Procedure, args: Seq[CallArgument]): Array[Any] = { + // build a map of declared parameter names to their positions + val params = procedure.parameters + val nameToPositionMap = params.map(_.name).zipWithIndex.toMap + + // verify named and positional args are not mixed + val containsNamedArg = args.exists(arg => arg.name.isDefined) + val containsPositionalArg = args.exists(arg => arg.name.isEmpty) + if (containsNamedArg && containsPositionalArg) { + throw new AnalysisException("Named and positional arguments cannot be mixed") + } + + // build a map of parameter names to args + val nameToArgMap = if (containsNamedArg) { + 
buildNameToArgMapUsingNames(args, nameToPositionMap) + } else { + buildNameToArgMapUsingPositions(args, params) + } + + // verify all required parameters are provided + params.filter(_.required) + .find(param => !nameToArgMap.contains(param.name)) + .foreach { missingArg => + throw new AnalysisException(s"Required procedure argument '${missingArg.name}' is missing") + } + + val values = new Array[Any](params.size) + + // convert provided args from internal Spark representation to Scala + nameToArgMap.foreach { case (name, arg) => + val position = nameToPositionMap(name) + val param = params(position) + val paramType = param.dataType + val argType = arg.expr.dataType + if (paramType != argType) { + throw new AnalysisException(s"Wrong arg type for '${param.name}': expected $paramType but got $argType") + } + values(position) = toScalaValue(arg.expr) + } + + // assign default values for optional params + params.foreach { + case p: OptionalParameter if !nameToArgMap.contains(p.name) => + val position = nameToPositionMap(p.name) + values(position) = p.defaultValue + case _ => + } + + values + } + + private def buildNameToArgMapUsingNames( + args: Seq[CallArgument], + nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = { + + val nameToArgMap = mutable.LinkedHashMap.empty[String, CallArgument] + args.foreach { arg => + val name = arg.name.get + + if (nameToArgMap.contains(name)) { + throw new AnalysisException(s"Duplicate procedure argument: '$name'") + } + + if (!nameToPositionMap.contains(name)) { + throw new AnalysisException(s"Unknown argument name: '$name'") + } + + nameToArgMap.put(name, arg) + } + nameToArgMap.toMap + } + + private def buildNameToArgMapUsingPositions( + args: Seq[CallArgument], + params: Seq[Parameter]): Map[String, CallArgument] = { + + val nameToArgMap = mutable.LinkedHashMap.empty[String, CallArgument] + args.zipWithIndex.foreach { case (arg, position) => Review comment: Could be easily updated if the order is not a concern. 
``` if (args.size > params.size) { throw new AnalysisException("Too many arguments for procedure") } args.zipWithIndex.map { case (arg, position) => val param = params(position) param.name -> arg }.toMap ``` ########## File path: spark3-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIcebergProcedures.scala ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Call, CallArgument, LogicalPlan} +import org.apache.spark.sql.catalyst.procedures.{IcebergProcedureRegistry, OptionalParameter, Parameter, Procedure} +import org.apache.spark.sql.catalyst.rules.Rule +import scala.collection.mutable + +// TODO: case sensitivity? 
+object ResolveIcebergProcedures extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case Call(nameParts, args) => + val procedure = IcebergProcedureRegistry.resolve(nameParts) + val values = buildProcedureValues(procedure, args) + procedure.createRunnableCommand(values) + } + + private def buildProcedureValues(procedure: Procedure, args: Seq[CallArgument]): Array[Any] = { + // build a map of declared parameter names to their positions + val params = procedure.parameters + val nameToPositionMap = params.map(_.name).zipWithIndex.toMap + + // verify named and positional args are not mixed + val containsNamedArg = args.exists(arg => arg.name.isDefined) + val containsPositionalArg = args.exists(arg => arg.name.isEmpty) + if (containsNamedArg && containsPositionalArg) { + throw new AnalysisException("Named and positional arguments cannot be mixed") + } + + // build a map of parameter names to args + val nameToArgMap = if (containsNamedArg) { + buildNameToArgMapUsingNames(args, nameToPositionMap) + } else { + buildNameToArgMapUsingPositions(args, params) + } + + // verify all required parameters are provided + params.filter(_.required) + .find(param => !nameToArgMap.contains(param.name)) + .foreach { missingArg => + throw new AnalysisException(s"Required procedure argument '${missingArg.name}' is missing") + } + + val values = new Array[Any](params.size) + + // convert provided args from internal Spark representation to Scala + nameToArgMap.foreach { case (name, arg) => + val position = nameToPositionMap(name) + val param = params(position) + val paramType = param.dataType + val argType = arg.expr.dataType + if (paramType != argType) { + throw new AnalysisException(s"Wrong arg type for '${param.name}': expected $paramType but got $argType") + } + values(position) = toScalaValue(arg.expr) + } + + // assign default values for optional params + params.foreach { + case p: OptionalParameter if 
!nameToArgMap.contains(p.name) => + val position = nameToPositionMap(p.name) + values(position) = p.defaultValue + case _ => + } + + values + } + + private def buildNameToArgMapUsingNames( + args: Seq[CallArgument], + nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = { + + val nameToArgMap = mutable.LinkedHashMap.empty[String, CallArgument] + args.foreach { arg => Review comment: Agree, I think we'd better indicate all violations. I'll rework this part. ########## File path: spark3-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveProcedures.scala ########## @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Call, CallArgument, CallStatement, LogicalPlan, NamedArgument, PositionalArgument} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.{CatalogNotFoundException, CatalogPlugin, Identifier, Procedure, ProcedureCatalog, ProcedureParameter} +import scala.collection.{mutable, Seq} + +// TODO: case sensitivity? +object ResolveProcedures extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case CallStatement(nameParts, args) => + val (catalog, ident) = resolveCatalog(nameParts) + + val procedure = catalog.asProcedureCatalog.loadProcedure(ident) + validateParams(procedure) + validateMethodHandle(procedure) + + val argValues = prepareArgValues(procedure, args) + Call(procedure, argValues) + } + + private def resolveCatalog(nameParts: Seq[String]): (CatalogPlugin, Identifier) = { Review comment: I could use `LookupCatalog` from Spark here but that would mean a strong dependency on internal rules. ########## File path: spark3-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveProcedures.scala ########## @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.plans.logical.{Call, CallArgument, CallStatement, LogicalPlan, NamedArgument, PositionalArgument} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.{CatalogNotFoundException, CatalogPlugin, Identifier, Procedure, ProcedureCatalog, ProcedureParameter} +import scala.collection.Seq + +object ResolveProcedures extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case CallStatement(nameParts, args) => + val (catalog, ident) = resolveCatalog(nameParts) + + val procedure = catalog.asProcedureCatalog.loadProcedure(ident) + + validateParams(procedure) + validateMethodHandle(procedure) + + Call(procedure, argValues = buildArgValues(procedure, args)) + } + + private def validateParams(procedure: Procedure): Unit = { + // should not be any duplicate param names + val duplicateParamNames = procedure.parameters.groupBy(_.name).collect { + case (name, matchingParams) if matchingParams.length > 1 => name + } + + if (duplicateParamNames.nonEmpty) { + throw new AnalysisException(s"Duplicate parameter names: ${duplicateParamNames.mkString("[", ",", "]")}") + } + + // optional params should be at the end + procedure.parameters.sliding(2).foreach { + case Array(previousParam, 
currentParam) if previousParam.required && !currentParam.required => + throw new AnalysisException("Optional parameters must be after required ones") + case _ => + } + } + + private def validateMethodHandle(procedure: Procedure): Unit = { + val params = procedure.parameters + val outputType = procedure.outputType + + val methodHandle = procedure.methodHandle + val methodType = methodHandle.`type` + val methodReturnType = methodType.returnType + + // method cannot accept var ags + if (methodHandle.isVarargsCollector) { + throw new AnalysisException("Method must have fixed arity") + } + + // verify the number of params in the procedure match the number of params in the method + if (params.length != methodType.parameterCount) { + throw new AnalysisException("Method parameter count must match the number of procedure parameters") + } + + // the MethodHandle API does not allow us to check the generic type + // so we only verify the return type is either void or iterable + + if (outputType.nonEmpty && methodReturnType != classOf[java.lang.Iterable[_]]) { + throw new AnalysisException( + s"Wrong method return type: $methodReturnType; the procedure defines $outputType " + + "as its output so must return java.lang.Iterable of Spark Rows") + } + + if (outputType.isEmpty && methodReturnType != classOf[Void]) { + throw new AnalysisException( + s"Wrong method return type: $methodReturnType; the procedure defines no output columns " + + "so must be void") + } + } + + private def buildArgValues(procedure: Procedure, args: Seq[CallArgument]): Array[Any] = { + val params = procedure.parameters + + // build a map of declared parameter names to their positions + val nameToPositionMap = params.map(_.name).zipWithIndex.toMap + + // build a map of parameter names to args + val nameToArgMap = buildNameToArgMap(params, args, nameToPositionMap) + + // verify all required parameters are provided + val missingParamNames = params.filter(_.required).collect { + case param if 
!nameToArgMap.contains(param.name) => param.name + } + + if (missingParamNames.nonEmpty) { + throw new AnalysisException(s"Missing required parameters: ${missingParamNames.mkString("[", ",", "]")}") + } + + val values = new Array[Any](params.size) + + // convert provided args from internal Spark representation to Scala + nameToArgMap.foreach { case (name, arg) => + val position = nameToPositionMap(name) + val param = params(position) + val paramType = param.dataType + val argType = arg.expr.dataType + if (paramType != argType) { + throw new AnalysisException(s"Wrong arg type for ${param.name}: expected $paramType but got $argType") + } + values(position) = toScalaValue(arg.expr) + } + + // assign default values to optional params + params.foreach { + case p if !p.required && !nameToArgMap.contains(p.name) => + val position = nameToPositionMap(p.name) + values(position) = p.defaultValue + case _ => + } + + values + } + + private def buildNameToArgMap( + params: Seq[ProcedureParameter], + args: Seq[CallArgument], + nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = { + + val containsNamedArg = args.exists(_.isInstanceOf[NamedArgument]) + val containsPositionalArg = args.exists(_.isInstanceOf[PositionalArgument]) + + if (containsNamedArg && containsPositionalArg) { + throw new AnalysisException("Named and positional arguments cannot be mixed") + } + + if (containsNamedArg) { + buildNameToArgMapUsingNames(args, nameToPositionMap) + } else { + buildNameToArgMapUsingPositions(args, params) + } + } + + private def buildNameToArgMapUsingNames( + args: Seq[CallArgument], + nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = { + + val namedArgs = args.asInstanceOf[Seq[NamedArgument]] + + val validationErrors = namedArgs.groupBy(_.name).collect { + case (name, matchingArgs) if matchingArgs.size > 1 => s"Duplicate procedure argument: $name" + case (name, _) if !nameToPositionMap.contains(name) => s"Unknown argument: $name" + } + + if 
(validationErrors.nonEmpty) { + throw new AnalysisException(s"Could not build name to arg map: ${validationErrors.mkString(", ")}") + } + + namedArgs.map(arg => arg.name -> arg).toMap + } + + private def buildNameToArgMapUsingPositions( + args: Seq[CallArgument], + params: Seq[ProcedureParameter]): Map[String, CallArgument] = { + + if (args.size > params.size) { + throw new AnalysisException("Too many arguments for procedure") + } + + args.zipWithIndex.map { case (arg, position) => + val param = params(position) + param.name -> arg + }.toMap + } + + private def toScalaValue(expr: Expression): Any = expr match { + case literal: Literal => CatalystTypeConverters.convertToScala(literal.value, literal.dataType) + case _ => throw new AnalysisException(s"Cannot convert '$expr' to a Scala literal value") + } + + private def resolveCatalog(nameParts: Seq[String]): (CatalogPlugin, Identifier) = { + val catalogManager = SparkSession.active.sessionState.catalogManager + + if (nameParts.length == 1) { + return (catalogManager.currentCatalog, Identifier.of(catalogManager.currentNamespace, nameParts.head)) + } + + try { + val catalogName = nameParts.head + val procedureNameParts = nameParts.tail + (catalogManager.catalog(catalogName), toIdentifier(procedureNameParts)) + } catch { + case _: CatalogNotFoundException => + (catalogManager.currentCatalog, toIdentifier(nameParts)) + } + } + + private def toIdentifier(nameParts: Seq[String]): Identifier = { Review comment: I could use `CatalogV2Implicits` and `LookupCatalog` here but I am not sure how safe it will be. ########## File path: spark3-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala ########## @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.parser.extensions + +import org.antlr.v4.runtime._ +import org.antlr.v4.runtime.atn.PredictionMode +import org.antlr.v4.runtime.misc.{Interval, ParseCancellationException} +import org.antlr.v4.runtime.tree.{ParseTree, TerminalNodeImpl} +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface} +import org.apache.spark.sql.catalyst.parser.ParserUtils._ +import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser._ +import org.apache.spark.sql.catalyst.plans.logical.{CallArgument, CallStatement, LogicalPlan, NamedArgument, PositionalArgument} +import org.apache.spark.sql.catalyst.trees.Origin +import org.apache.spark.sql.types.{ByteType, DataType, DoubleType, FloatType, LongType, ShortType, StructType} +import scala.collection.JavaConverters._ + +class IcebergSparkSqlExtensionsParser(delegate: ParserInterface) extends ParserInterface { + + private val astBuilder = new IcebergSqlExtensionsAstBuilder() + + /** + * Parse a string to a DataType. 
+ */ + override def parseDataType(sqlText: String): DataType = { + delegate.parseDataType(sqlText) + } + + /** + * Parse a string to a raw DataType without CHAR/VARCHAR replacement. + */ + override def parseRawDataType(sqlText: String): DataType = { + delegate.parseRawDataType(sqlText) + } + + /** + * Parse a string to an Expression. + */ + override def parseExpression(sqlText: String): Expression = { + delegate.parseExpression(sqlText) + } + + /** + * Parse a string to a TableIdentifier. + */ + override def parseTableIdentifier(sqlText: String): TableIdentifier = { + delegate.parseTableIdentifier(sqlText) + } + + /** + * Parse a string to a FunctionIdentifier. + */ + override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { + delegate.parseFunctionIdentifier(sqlText) + } + + /** + * Parse a string to a multi-part identifier. + */ + override def parseMultipartIdentifier(sqlText: String): Seq[String] = { + delegate.parseMultipartIdentifier(sqlText) + } + + /** + * Creates StructType for a given SQL string, which is a comma separated list of field + * definitions which will preserve the correct Hive metadata. + */ + override def parseTableSchema(sqlText: String): StructType = { + delegate.parseTableSchema(sqlText) + } + + /** + * Parse a string to a LogicalPlan. 
+ */ + override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser => + astBuilder.visit(parser.singleStatement()) match { + case plan: LogicalPlan => plan + case _ => delegate.parsePlan(sqlText) + } + } + + protected def parse[T](command: String)(toResult: IcebergSqlExtensionsParser => T): T = { + val lexer = new IcebergSqlExtensionsLexer(new UpperCaseCharStream(CharStreams.fromString(command))) + lexer.removeErrorListeners() + lexer.addErrorListener(ParseErrorListener) + + val tokenStream = new CommonTokenStream(lexer) + val parser = new IcebergSqlExtensionsParser(tokenStream) + parser.addParseListener(PostProcessor) + parser.removeErrorListeners() + parser.addErrorListener(ParseErrorListener) + + try { + try { + // first, try parsing with potentially faster SLL mode + parser.getInterpreter.setPredictionMode(PredictionMode.SLL) + toResult(parser) + } + catch { + case _: ParseCancellationException => + // if we fail, parse with LL mode + tokenStream.seek(0) // rewind input stream + parser.reset() + + // Try Again. 
+ parser.getInterpreter.setPredictionMode(PredictionMode.LL) + toResult(parser) + } + } + catch { + case e: ParseException if e.command.isDefined => + throw e + case e: ParseException => + throw e.withCommand(command) + case e: AnalysisException => + val position = Origin(e.line, e.startPosition) + throw new ParseException(Option(command), e.message, position, position) + } + } +} + +// literal parsing is taken from Spark's AstBuilder +class IcebergSqlExtensionsAstBuilder extends IcebergSqlExtensionsBaseVisitor[AnyRef] { + + override def visitCall(ctx: CallContext): LogicalPlan = { + val name = ctx.multipartIdentifier.parts.asScala.map(_.getText) + val args = ctx.callArgument.asScala.map(typedVisit[CallArgument]) + CallStatement(name, args) + } + + override def visitPositionalArgument(ctx: PositionalArgumentContext): CallArgument = { + val expr = typedVisit[Expression](ctx.expression) + PositionalArgument(expr) + } + + override def visitNamedArgument(ctx: NamedArgumentContext): CallArgument = { + val name = ctx.identifier.getText + val expr = typedVisit[Expression](ctx.expression) + NamedArgument(name, expr) + } + + override def visitNonIcebergCommand(ctx: NonIcebergCommandContext): LogicalPlan = null + + override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) { + visit(ctx.statement).asInstanceOf[LogicalPlan] + } + + /** + * Create an integral literal expression. The code selects the most narrow integral type + * possible, either a BigDecimal, a Long or an Integer is returned. + */ + override def visitIntegerLiteral(ctx: IntegerLiteralContext): Literal = withOrigin(ctx) { + BigDecimal(ctx.getText) match { + case v if v.isValidInt => + Literal(v.intValue) + case v if v.isValidLong => + Literal(v.longValue) + case v => Literal(v.underlying()) + } + } + + /** + * Create a Byte Literal expression. 
+ */ + override def visitTinyIntLiteral(ctx: TinyIntLiteralContext): Literal = { + val rawStrippedQualifier = ctx.getText.substring(0, ctx.getText.length - 1) + numericLiteral(ctx, rawStrippedQualifier, Byte.MinValue, Byte.MaxValue, ByteType.simpleString)(_.toByte) + } + + /** + * Create a Short Literal expression. + */ + override def visitSmallIntLiteral(ctx: SmallIntLiteralContext): Literal = { + val rawStrippedQualifier = ctx.getText.substring(0, ctx.getText.length - 1) + numericLiteral(ctx, rawStrippedQualifier, Short.MinValue, Short.MaxValue, ShortType.simpleString)(_.toShort) + } + + /** + * Create a Long Literal expression. + */ + override def visitBigIntLiteral(ctx: BigIntLiteralContext): Literal = { + val rawStrippedQualifier = ctx.getText.substring(0, ctx.getText.length - 1) + numericLiteral(ctx, rawStrippedQualifier, Long.MinValue, Long.MaxValue, LongType.simpleString)(_.toLong) + } + + /** + * Create a Float Literal expression. + */ + override def visitFloatLiteral(ctx: FloatLiteralContext): Literal = { + val rawStrippedQualifier = ctx.getText.substring(0, ctx.getText.length - 1) + numericLiteral(ctx, rawStrippedQualifier, Float.MinValue, Float.MaxValue, FloatType.simpleString)(_.toFloat) + } + + /** + * Create a Double Literal expression. + */ + override def visitDoubleLiteral(ctx: DoubleLiteralContext): Literal = { + val rawStrippedQualifier = ctx.getText.substring(0, ctx.getText.length - 1) + numericLiteral(ctx, rawStrippedQualifier, Double.MinValue, Double.MaxValue, DoubleType.simpleString)(_.toDouble) + } + + /** + * Create a double literal for number with an exponent, e.g. 1E-30 + */ + override def visitExponentLiteral(ctx: ExponentLiteralContext): Literal = { + numericLiteral(ctx, ctx.getText, /* exponent values don't have a suffix */ + Double.MinValue, Double.MaxValue, DoubleType.simpleString)(_.toDouble) + } + + /** + * Create a decimal literal for a regular decimal number. 
+ */ + override def visitDecimalLiteral(ctx: DecimalLiteralContext): Literal = withOrigin(ctx) { + Literal(BigDecimal(ctx.getText).underlying()) + } + + /** + * Create a BigDecimal Literal expression. + */ + override def visitBigDecimalLiteral(ctx: BigDecimalLiteralContext): Literal = { + val raw = ctx.getText.substring(0, ctx.getText.length - 2) + try { + Literal(BigDecimal(raw).underlying()) + } catch { + case e: AnalysisException => + throw new ParseException(e.message, ctx) + } + } + + /** Create a numeric literal expression. */ + private def numericLiteral( + ctx: NumberContext, + rawStrippedQualifier: String, + minValue: BigDecimal, + maxValue: BigDecimal, + typeName: String)(converter: String => Any): Literal = withOrigin(ctx) { + try { + val rawBigDecimal = BigDecimal(rawStrippedQualifier) + if (rawBigDecimal < minValue || rawBigDecimal > maxValue) { + throw new ParseException(s"Numeric literal ${rawStrippedQualifier} does not " + + s"fit in range [${minValue}, ${maxValue}] for type ${typeName}", ctx) + } + Literal(converter(rawStrippedQualifier)) + } catch { + case e: NumberFormatException => + throw new ParseException(e.getMessage, ctx) + } + } + + /** + * Create a Boolean literal expression. + */ + override def visitBooleanLiteral(ctx: BooleanLiteralContext): Literal = withOrigin(ctx) { + if (ctx.getText.toBoolean) { + Literal.TrueLiteral + } else { + Literal.FalseLiteral + } + } + + /** + * Create a String literal expression. + */ + override def visitStringLiteral(ctx: StringLiteralContext): Literal = withOrigin(ctx) { + Literal(createString(ctx)) + } + + // we ignore legacy spark.sql.parser.escapedStringLiterals to avoid a dependency on SQLConf in the parser Review comment: This part requires attention. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org