Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/17785#discussion_r113851718 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala --- @@ -92,48 +93,8 @@ private[sql] object SQLUtils extends Logging { def r: Regex = new Regex(sc.parts.mkString, sc.parts.tail.map(_ => "x"): _*) } - def getSQLDataType(dataType: String): DataType = { - dataType match { - case "byte" => org.apache.spark.sql.types.ByteType - case "integer" => org.apache.spark.sql.types.IntegerType - case "float" => org.apache.spark.sql.types.FloatType - case "double" => org.apache.spark.sql.types.DoubleType - case "numeric" => org.apache.spark.sql.types.DoubleType - case "character" => org.apache.spark.sql.types.StringType - case "string" => org.apache.spark.sql.types.StringType - case "binary" => org.apache.spark.sql.types.BinaryType - case "raw" => org.apache.spark.sql.types.BinaryType - case "logical" => org.apache.spark.sql.types.BooleanType - case "boolean" => org.apache.spark.sql.types.BooleanType - case "timestamp" => org.apache.spark.sql.types.TimestampType - case "date" => org.apache.spark.sql.types.DateType - case r"\Aarray<(.+)${elemType}>\Z" => - org.apache.spark.sql.types.ArrayType(getSQLDataType(elemType)) - case r"\Amap<(.+)${keyType},(.+)${valueType}>\Z" => - if (keyType != "string" && keyType != "character") { - throw new IllegalArgumentException("Key type of a map must be string or character") - } - org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType)) - case r"\Astruct<(.+)${fieldsStr}>\Z" => - if (fieldsStr(fieldsStr.length - 1) == ',') { - throw new IllegalArgumentException(s"Invalid type $dataType") - } - val fields = fieldsStr.split(",") - val structFields = fields.map { field => - field match { - case r"\A(.+)${fieldName}:(.+)${fieldType}\Z" => - createStructField(fieldName, fieldType, true) - - case _ => throw new IllegalArgumentException(s"Invalid type $dataType") - } - } 
- createStructType(structFields) - case _ => throw new IllegalArgumentException(s"Invalid type $dataType") - } - } - def createStructField(name: String, dataType: String, nullable: Boolean): StructField = { - val dtObj = getSQLDataType(dataType) + val dtObj = CatalystSqlParser.parseDataType(dataType) --- End diff -- To my knowledge, `getSQLDataType` supports the types below: ``` binary boolean byte character date double float integer logical numeric raw string timestamp array<...> struct<...> map<...> ``` and these appear to be required to be _case-sensitive_ whereas `parseDataType` supports ... ``` bigint binary boolean byte char date decimal double float int integer long short smallint string timestamp tinyint varchar array<...> struct<...> map<...> ``` and these look _case-insensitive_. I think the initial intention for `getSQLDataType` was to support R type string conversions but it looks like unreachable code now because we were checking the type strings before actually calling `getSQLDataType` in [`checkType`](https://github.com/apache/spark/blob/39e2bad6a866d27c3ca594d15e574a1da3ee84cc/R/pkg/R/schema.R#L129-L187). If the types are not in `!is.null(PRIMITIVE_TYPES[[type]])` (_case-sensitive_), it appears to throw an error. ``` bigint binary boolean byte date decimal double float int integer smallint string timestamp tinyint array<...> map<...> struct<...> ``` In short, I think there should not be a behaviour change for the types below (the intersection between `getSQLDataType` and `parseDataType`) ... ``` binary string double float boolean timestamp date integer byte array<...> map<...> struct<...> ``` and these should be case-sensitive. _Additionally_, we will support the types below (which are written in R's [`PRIMITIVE_TYPES`](https://github.com/apache/spark/blob/bc0a0e6392c4e729d8f0e4caffc0bd05adb0d950/R/pkg/R/types.R#L21-L39) but `getSQLDataType` did not support before): ``` tinyint smallint int bigint ``` **Before** ```r > structField("_col", "tinyint") ... 
Error in handleErrors(returnStatus, conn) : java.lang.IllegalArgumentException: Invalid type tinyint at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131) at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136) at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala) ... ``` ```r > structField("_col", "smallint") ... Error in handleErrors(returnStatus, conn) : java.lang.IllegalArgumentException: Invalid type smallint at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131) at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136) at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala) ... ``` ```r > structField("_col", "int") ... Error in handleErrors(returnStatus, conn) : java.lang.IllegalArgumentException: Invalid type int at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131) at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136) at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala) ... ``` ```r > structField("_col", "bigint") ... Error in handleErrors(returnStatus, conn) : java.lang.IllegalArgumentException: Invalid type bigint at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131) at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136) at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala) ... ``` **After** ```r > structField("_col", "tinyint") StructField(name = "_col", type = "ByteType", nullable = TRUE)> ``` ```r > structField("_col", "smallint") StructField(name = "_col", type = "ShortType", nullable = TRUE)> ``` ```r > structField("_col", "int") StructField(name = "_col", type = "IntegerType", nullable = TRUE)> ``` ```r > structField("_col", "bigint") StructField(name = "_col", type = "LongType", nullable = TRUE)> ```
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org