Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/21667#discussion_r199515847
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala ---
@@ -42,63 +38,27 @@ object DataSourceUtils {
   /**
    * Verify if the schema is supported in datasource. This verification should be done
-   * in a driver side, e.g., `prepareWrite`, `buildReader`, and `buildReaderWithPartitionValues`
-   * in `FileFormat`.
-   *
-   * Unsupported data types of csv, json, orc, and parquet are as follows;
-   *  csv -> R/W: Interval, Null, Array, Map, Struct
-   *  json -> W: Interval
-   *  orc -> W: Interval, Null
-   *  parquet -> R/W: Interval, Null
+   * in a driver side.
    */
   private def verifySchema(format: FileFormat, schema: StructType, isReadPath: Boolean): Unit = {
-    def throwUnsupportedException(dataType: DataType): Unit = {
-      throw new UnsupportedOperationException(
-        s"$format data source does not support ${dataType.simpleString} data type.")
-    }
-
-    def verifyType(dataType: DataType): Unit = dataType match {
-      case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType |
-           StringType | BinaryType | DateType | TimestampType | _: DecimalType =>
-
-      // All the unsupported types for CSV
-      case _: NullType | _: CalendarIntervalType | _: StructType | _: ArrayType | _: MapType
-          if format.isInstanceOf[CSVFileFormat] =>
-        throwUnsupportedException(dataType)
-
-      case st: StructType => st.foreach { f => verifyType(f.dataType) }
-
-      case ArrayType(elementType, _) => verifyType(elementType)
-
-      case MapType(keyType, valueType, _) =>
-        verifyType(keyType)
-        verifyType(valueType)
-
-      case udt: UserDefinedType[_] => verifyType(udt.sqlType)
-
-      // Interval type not supported in all the write path
-      case _: CalendarIntervalType if !isReadPath =>
-        throwUnsupportedException(dataType)
-
-      // JSON and ORC don't support an Interval type, but we pass it in read pass
-      // for back-compatibility.
-      case _: CalendarIntervalType if format.isInstanceOf[JsonFileFormat] ||
-          format.isInstanceOf[OrcFileFormat] =>
+    def verifyType(dataType: DataType): Unit = {
+      if (!format.supportDataType(dataType, isReadPath)) {
+        throw new UnsupportedOperationException(
+          s"$format data source does not support ${dataType.simpleString} data type.")
+      }
+      dataType match {
--- End diff ---
Wait .. why do we do the recursive thing here? What if the top-level type is
supported but a nested type is not? For example, Arrow doesn't currently
support nested timestamp conversion due to a localization issue.
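
To make the concern concrete, here is a rough, self-contained sketch in plain
Scala (the `supports` predicate below is a hypothetical stand-in for a
format's `supportDataType` hook, not the actual API). Because the hook sees
one `DataType` at a time, recursing from the outside collapses top-level and
nested occurrences into the same question:

```scala
import org.apache.spark.sql.types._

object NestedSupportSketch {
  // Hypothetical per-type predicate standing in for supportDataType: it
  // sees a single DataType with no nesting context. To reject nested
  // timestamps (the Arrow example above), the only knob available is to
  // reject TimestampType everywhere.
  def supports(dt: DataType): Boolean = dt match {
    case TimestampType => false
    case _ => true
  }

  // Outer recursion shaped like verifyType: the same per-type check is
  // applied at every nesting level.
  def verify(dt: DataType): Boolean = dt match {
    case ArrayType(et, _)   => supports(dt) && verify(et)
    case MapType(kt, vt, _) => supports(dt) && verify(kt) && verify(vt)
    case StructType(fields) => supports(dt) && fields.forall(f => verify(f.dataType))
    case leaf               => supports(leaf)
  }

  def main(args: Array[String]): Unit = {
    println(verify(ArrayType(TimestampType))) // false: nested timestamp rejected, as intended
    println(verify(TimestampType))            // false too: top-level timestamp is also
                                              // rejected, which is the problem
  }
}
```

If a format needs position-dependent answers, one could imagine passing the
nesting context into the hook, or letting each format walk the whole schema
itself; just a thought, not a claim about what this PR should do.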
---