Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/17785#discussion_r113851718
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
---
@@ -92,48 +93,8 @@ private[sql] object SQLUtils extends Logging {
def r: Regex = new Regex(sc.parts.mkString, sc.parts.tail.map(_ =>
"x"): _*)
}
- def getSQLDataType(dataType: String): DataType = {
- dataType match {
- case "byte" => org.apache.spark.sql.types.ByteType
- case "integer" => org.apache.spark.sql.types.IntegerType
- case "float" => org.apache.spark.sql.types.FloatType
- case "double" => org.apache.spark.sql.types.DoubleType
- case "numeric" => org.apache.spark.sql.types.DoubleType
- case "character" => org.apache.spark.sql.types.StringType
- case "string" => org.apache.spark.sql.types.StringType
- case "binary" => org.apache.spark.sql.types.BinaryType
- case "raw" => org.apache.spark.sql.types.BinaryType
- case "logical" => org.apache.spark.sql.types.BooleanType
- case "boolean" => org.apache.spark.sql.types.BooleanType
- case "timestamp" => org.apache.spark.sql.types.TimestampType
- case "date" => org.apache.spark.sql.types.DateType
- case r"\Aarray<(.+)${elemType}>\Z" =>
- org.apache.spark.sql.types.ArrayType(getSQLDataType(elemType))
- case r"\Amap<(.+)${keyType},(.+)${valueType}>\Z" =>
- if (keyType != "string" && keyType != "character") {
- throw new IllegalArgumentException("Key type of a map must be
string or character")
- }
- org.apache.spark.sql.types.MapType(getSQLDataType(keyType),
getSQLDataType(valueType))
- case r"\Astruct<(.+)${fieldsStr}>\Z" =>
- if (fieldsStr(fieldsStr.length - 1) == ',') {
- throw new IllegalArgumentException(s"Invalid type $dataType")
- }
- val fields = fieldsStr.split(",")
- val structFields = fields.map { field =>
- field match {
- case r"\A(.+)${fieldName}:(.+)${fieldType}\Z" =>
- createStructField(fieldName, fieldType, true)
-
- case _ => throw new IllegalArgumentException(s"Invalid type
$dataType")
- }
- }
- createStructType(structFields)
- case _ => throw new IllegalArgumentException(s"Invalid type
$dataType")
- }
- }
-
def createStructField(name: String, dataType: String, nullable:
Boolean): StructField = {
- val dtObj = getSQLDataType(dataType)
+ val dtObj = CatalystSqlParser.parseDataType(dataType)
--- End diff --
To my knowledge, `getSQLDataType` supports the types below:
```
binary
boolean
byte
character
date
double
float
integer
logical
numeric
raw
string
timestamp
array<...>
struct<...>
map<...>
```
and these appear to be required to be _case-sensitive_, whereas `parseDataType`
supports ...
```
bigint
binary
boolean
byte
char
date
decimal
double
float
int
integer
long
short
smallint
string
timestamp
tinyint
varchar
array<...>
struct<...>
map<...>
```
and these look _case-insensitive_.
I think the initial intention for `getSQLDataType` was to support R type
string conversions, but it looks like unreachable code now because we were already checking
the type strings before actually calling `getSQLDataType` in
[`checkType`](https://github.com/apache/spark/blob/39e2bad6a866d27c3ca594d15e574a1da3ee84cc/R/pkg/R/schema.R#L129-L187).
If a type does not satisfy `!is.null(PRIMITIVE_TYPES[[type]])`
(_case-sensitive_), it looks like an error is thrown.
```
bigint
binary
boolean
byte
date
decimal
double
float
int
integer
smallint
string
timestamp
tinyint
array<...>
map<...>
struct<...>
```
In short, I think there should not be a behaviour change for the types below
(the intersection between `getSQLDataType` and `parseDataType`) ...
```
binary
string
double
float
boolean
timestamp
date
integer
byte
array<...>
map<...>
struct<...>
```
and these should be case-sensitive.
_Additionally_, we will support the types below (which are listed in R's
[`PRIMITIVE_TYPES`](https://github.com/apache/spark/blob/bc0a0e6392c4e729d8f0e4caffc0bd05adb0d950/R/pkg/R/types.R#L21-L39)
but `getSQLDataType` did not support before):
```
tinyint
smallint
int
bigint
```
**Before**
```r
> structField("_col", "tinyint")
...
Error in handleErrors(returnStatus, conn) :
java.lang.IllegalArgumentException: Invalid type tinyint
at
org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
at
org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
```
```r
> structField("_col", "smallint")
...
Error in handleErrors(returnStatus, conn) :
java.lang.IllegalArgumentException: Invalid type smallint
at
org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
at
org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
```
```r
> structField("_col", "int")
...
Error in handleErrors(returnStatus, conn) :
java.lang.IllegalArgumentException: Invalid type int
at
org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
at
org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
```
```r
> structField("_col", "bigint")
...
Error in handleErrors(returnStatus, conn) :
java.lang.IllegalArgumentException: Invalid type bigint
at
org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
at
org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
```
**After**
```r
> structField("_col", "tinyint")
StructField(name = "_col", type = "ByteType", nullable = TRUE)>
```
```r
> structField("_col", "smallint")
StructField(name = "_col", type = "ShortType", nullable = TRUE)>
```
```r
> structField("_col", "int")
StructField(name = "_col", type = "IntegerType", nullable = TRUE)>
```
```r
> structField("_col", "bigint")
StructField(name = "_col", type = "LongType", nullable = TRUE)>
```
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]