gene-db commented on code in PR #45826:
URL: https://github.com/apache/spark/pull/45826#discussion_r1552733414
##########
python/pyspark/sql/types.py:
##########
@@ -1468,6 +1475,36 @@ def __eq__(self, other: Any) -> bool:
return type(self) == type(other)
+class VariantVal:
Review Comment:
I'm not sure what this comment means?
##########
python/pyspark/sql/tests/test_types.py:
##########
@@ -1406,6 +1407,68 @@ def test_calendar_interval_type_with_sf(self):
schema1 = self.spark.range(1).select(F.make_interval(F.lit(1))).schema
self.assertEqual(schema1.fields[0].dataType, CalendarIntervalType())
+ def test_variant_type(self):
+ from decimal import Decimal
+ self.assertEqual(VariantType().simpleString(), "variant")
+
+ # Holds a tuple of (key, json string value, python value)
+ expected_values = [
+ ("str", '"%s"' % ("0123456789" * 10), "0123456789" * 10),
+ ("short_str", '"abc"', "abc"),
+ ("null", "null", None),
+ ("true", "true", True),
+ ("false", "false", False),
+ ("int1", "1", 1),
+ ("-int1", "-5", -5),
+ ("int2", "257", 257),
+ ("-int2", "-124", -124),
+ ("int4", "65793", 65793),
+ ("-int4", "-69633", -69633),
+ ("int8", "4295033089", 4295033089),
+ ("-int8", "-4294967297", -4294967297),
+ ("float4", "1.23456789e-30", 1.23456789e-30),
+ ("-float4", "-4.56789e+29", -4.56789e+29),
+ ("dec4", "123.456", Decimal("123.456")),
+ ("-dec4", "-321.654", Decimal("-321.654")),
+ ("dec8", "429.4967297", Decimal("429.4967297")),
+ ("-dec8", "-5.678373902", Decimal("-5.678373902")),
+ ("dec16", "467440737095.51617", Decimal("467440737095.51617")),
+ ("-dec16", "-67.849438003827263", Decimal("-67.849438003827263")),
+ ("arr", '[1.1,"2",[3],{"4":5}]', [Decimal("1.1"), "2", [3], {"4":
5}]),
+ ("obj", '{"a":["123",{"b":2}],"c":3}', {"a": ["123", {"b": 2}],
"c": 3}),
+ ]
+ json_str = "{%s}" % ",".join(['"%s": %s' % (t[0], t[1]) for t in
expected_values])
+
+ df = self.spark.createDataFrame([({"json": json_str})])
Review Comment:
Hrmmm, I will have to look into that. My guess is that it won't work
currently, so I may need to add that functionality.
##########
python/pyspark/sql/types.py:
##########
@@ -1468,6 +1475,36 @@ def __eq__(self, other: Any) -> bool:
return type(self) == type(other)
+class VariantVal:
+ """
+ A class to represent a Variant value in Python.
Review Comment:
Updated the comments.
##########
python/pyspark/sql/types.py:
##########
@@ -1468,6 +1475,36 @@ def __eq__(self, other: Any) -> bool:
return type(self) == type(other)
+class VariantVal:
+ """
+ A class to represent a Variant value in Python.
+ """
+
+ def __init__(self, value: bytes, metadata: bytes):
+ self.value = value
+ self.metadata = metadata
+
+ def __str__(self) -> str:
+ return self.toString()
+
+ def __repr__(self) -> str:
+ return "VariantVal(%s, %s)" % (self.value, self.metadata)
+
+ def toString(self) -> str:
+ """
+ Convert the VariantVal to a string.
+ :return: a string representation of the Variant
Review Comment:
Updated to follow the numpydoc style.
##########
python/pyspark/sql/variant_utils.py:
##########
@@ -0,0 +1,385 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import decimal
+import json
+import struct
+from array import array
+from typing import Any
+
+
+class VariantUtils:
+ """
+ A utility class for VariantVal.
+
+ Adapted from library at: org.apache.spark.types.variant.VariantUtil
+ """
+
+ BASIC_TYPE_BITS = 2
+ BASIC_TYPE_MASK = 0x3
+ TYPE_INFO_MASK = 0x3F
+ # The inclusive maximum value of the type info value. It is the size limit
of `SHORT_STR`.
+ MAX_SHORT_STR_SIZE = 0x3F
+
+ # Below are all possible basic type values.
+ # Primitive value. The type info value must be one of the values in the
below section.
+ PRIMITIVE = 0
+ # Short string value. The type info value is the string size, which must
be in `[0,
+ # MAX_SHORT_STR_SIZE]`.
+ # The string content bytes directly follow the header byte.
+ SHORT_STR = 1
+ # Object value. The content contains a size, a list of field ids, a list
of field offsets, and
+ # the actual field data. The length of the id list is `size`, while the
length of the offset
+ # list is `size + 1`, where the last offset represents the total size of
the field data. The
+ # fields in an object must be sorted by the field name in alphabetical
order. Duplicate field
+ # names in one object are not allowed.
+ # We use 5 bits in the type info to specify the integer type of the object
header: it should
+ # be 0_b4_b3b2_b1b0 (MSB is 0), where:
+ # - b4 specifies the type of size. When it is 0/1, `size` is a
little-endian 1/4-byte
+ # unsigned integer.
+ # - b3b2/b1b0 specifies the integer type of id and offset. When the 2 bits
are 0/1/2, the
+ # list contains 1/2/3-byte little-endian unsigned integers.
+ OBJECT = 2
+ # Array value. The content contains a size, a list of field offsets, and
the actual element
+ # data. It is similar to an object without the id list. The length of the
offset list
+ # is `size + 1`, where the last offset represents the total size of the
element data.
+ # Its type info should be: 000_b2_b1b0:
+ # - b2 specifies the type of size.
+ # - b1b0 specifies the integer type of offset.
+ ARRAY = 3
+
+ # Below are all possible type info values for `PRIMITIVE`.
+ # JSON Null value. Empty content.
+ NULL = 0
+ # True value. Empty content.
+ TRUE = 1
+ # False value. Empty content.
+ FALSE = 2
+ # 1-byte little-endian signed integer.
+ INT1 = 3
+ # 2-byte little-endian signed integer.
+ INT2 = 4
+ # 4-byte little-endian signed integer.
+ INT4 = 5
+ # 8-byte little-endian signed integer.
+ INT8 = 6
+ # 8-byte IEEE double.
+ DOUBLE = 7
+ # 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed
integer.
+ DECIMAL4 = 8
+ # 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed
integer.
+ DECIMAL8 = 9
+ # 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed
integer.
+ DECIMAL16 = 10
+ # Long string value. The content is (4-byte little-endian unsigned integer
representing the
+ # string size) + (size bytes of string content).
+ LONG_STR = 16
+
+ U32_SIZE = 4
+
+ @classmethod
+ def to_json(cls, value: bytes, metadata: bytes) -> str:
+ """
+ Convert the VariantVal to a JSON string.
+ :return: JSON string
+ """
+ return cls._to_json(value, metadata, 0)
+
+ @classmethod
+ def to_python(cls, value: bytes, metadata: bytes) -> str:
+ """
+ Convert the VariantVal to a nested Python object of Python data types.
+ :return: Python representation of the Variant nested structure
+ """
+ return cls._to_python(value, metadata, 0)
+
+ @classmethod
+ def _read_long(cls, data: bytes, pos: int, num_bytes: int, signed: bool)
-> int:
+ cls._check_index(pos, len(data))
+ cls._check_index(pos + num_bytes - 1, len(data))
+ return int.from_bytes(data[pos : pos + num_bytes], byteorder="little",
signed=signed)
+
+ @classmethod
+ def _check_index(cls, pos: int, length: int) -> None:
+ if pos < 0 or pos >= length:
+ raise Exception("Malformed Variant")
Review Comment:
Thanks! I updated it to reuse an existing exception class.
##########
python/pyspark/sql/variant_utils.py:
##########
@@ -0,0 +1,376 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import decimal
+import json
+import struct
+from array import array
+from typing import Any
+
+class VariantUtils:
Review Comment:
I originally left it out of `types.py` to avoid making that file too much
larger with this Variant-specific code. Are there some downsides to keeping it
separate?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]