gene-db commented on code in PR #47907:
URL: https://github.com/apache/spark/pull/47907#discussion_r1735534432


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala:
##########
@@ -128,7 +128,7 @@ object Cast extends QueryErrorsBase {
     case (TimestampType, _: NumericType) => true
 
     case (VariantType, _) => variant.VariantGet.checkDataType(to)
-    case (_, VariantType) => variant.VariantGet.checkDataType(from)
+    case (_, VariantType) => variant.VariantGet.checkDataType(from, 
allowStructsAndMaps = false)

Review Comment:
   We should leave a comment here and for the other one saying we are not 
allowing structs and maps to cast to variant, since the cast would be lossy if 
it converts to a variant object.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala:
##########
@@ -114,6 +114,85 @@ case class IsVariantNull(child: Expression) extends 
UnaryExpression
     copy(child = newChild)
 }
 
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Convert a nested input (array/map/struct) into a 
variant where maps and structs are converted to variant objects which are 
unordered unlike SQL structs. Input maps can only have string keys.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_(named_struct('a', 1, 'b', 2));
+       {"a":1,"b":2}
+      > SELECT _FUNC_(array(1, 2, 3));
+       [1, 2, 3]
+      > SELECT _FUNC_(array(named_struct('a', 1)));
+       [{"a":1}]
+      > SELECT _FUNC_(array(map("a", 2)));
+       [{"a":2}]
+  """,
+  since = "4.0.0",
+  group = "variant_funcs")
+// scalastyle:on line.size.limit
+case class ToVariantObject(child: Expression)
+    extends UnaryExpression
+    with NullIntolerant
+    with QueryErrorsBase {
+
+  override val dataType: DataType = VariantType
+
+  // Only accept nested types at the root but any types can be nested inside.
+  override def checkInputDataTypes(): TypeCheckResult = {
+    val checkResult: Boolean = child.dataType match {
+      case ArrayType(elementType, _) =>
+        VariantGet.checkDataType(elementType, allowStructsAndMaps = true)
+      case MapType(_: StringType, valueType, _) =>
+        VariantGet.checkDataType(valueType, allowStructsAndMaps = true)
+      case StructType(fields) =>
+        fields.forall(f => VariantGet.checkDataType(f.dataType, 
allowStructsAndMaps = true))

Review Comment:
   I feel like this could be simplified like:
   
   ```suggestion
         case _: StructType | _: ArrayType | _: MapType =>
           VariantGet.checkDataType(child.dataType, allowStructsAndMaps = true)
   ```
   since `VariantGet.checkDataType` will handle all the type checking.



##########
python/pyspark/sql/tests/test_functions.py:
##########
@@ -1364,6 +1364,13 @@ def test_try_parse_json(self):
         self.assertEqual("""{"a":1}""", actual[0]["var"])
         self.assertEqual(None, actual[1]["var"])
 
+    def test_to_variant_object(self):
+        df = self.spark.createDataFrame([(1, {"a": 1})], "i int, v struct<a 
int>")

Review Comment:
   Can we test a few more cases, like `map<string, struct>` and `array<struct>`?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala:
##########
@@ -663,7 +745,21 @@ object SchemaOfVariant {
   /** The actual implementation of the `SchemaOfVariant` expression. */
   def schemaOfVariant(input: VariantVal): UTF8String = {
     val v = new Variant(input.getValue, input.getMetadata)
-    UTF8String.fromString(schemaOf(v).sql)
+    UTF8String.fromString(printSchema(schemaOf(v)))
+  }
+
+  /**
+   * Similar to `dataType.sql`. The only difference is that `StructType` is 
shown as
+   * `OBJECT<...>` rather than `STRUCT<...>`.

Review Comment:
   We should note that the reason we do this is because `schemaOf` uses 
`struct` for a variant object, when it is not a true struct (struct fields are 
ordered, object fields are unordered). Therefore, we have to "translate" the 
`struct` back to the string "OBJECT".



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to