jorisvandenbossche commented on code in PR #44070:
URL: https://github.com/apache/arrow/pull/44070#discussion_r1771198883
##########
python/pyarrow/array.pxi:
##########
@@ -4343,6 +4343,32 @@ cdef class ExtensionArray(Array):
return result
+class JsonArray(ExtensionArray):
+ """
+ Concrete class for Arrow arrays of JSON data type.
+ This does not guarantee that the JSON data actually
Review Comment:
```suggestion
Concrete class for Arrow arrays of JSON data type.

This does not guarantee that the JSON data actually
```
(blank line between first summary line and the extended explanation)
##########
python/pyarrow/tests/parquet/test_data_types.py:
##########
@@ -510,3 +510,16 @@ def test_large_binary_overflow():
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)
+
+
[email protected]("storage_type", (
+ pa.utf8(), pa.large_utf8(), pa.string(), pa.large_string()))
+def test_json_extension_type(storage_type):
+ data = ['{"a": 1}', '{"b": 2}', None]
+ storage = pa.array(data, type=storage_type)
+ json_type = pa.json_(storage_type)
+
+ arr = pa.ExtensionArray.from_storage(json_type, storage)
+ table = pa.table([arr], names=["ext"])
+
+ _simple_table_roundtrip(table, use_dictionary=False)
Review Comment:
Does it not work with `use_dictionary=True`?
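Something like this could cover both code paths (just a sketch; it assumes `_simple_table_roundtrip` forwards `use_dictionary` to the Parquet writer, as the existing call suggests):

```python
# assumes pytest, pyarrow as pa and _simple_table_roundtrip are already
# imported at the top of this test module
@pytest.mark.parametrize("use_dictionary", (False, True))
@pytest.mark.parametrize("storage_type", (pa.string(), pa.large_string()))
def test_json_extension_type(storage_type, use_dictionary):
    data = ['{"a": 1}', '{"b": 2}', None]
    storage = pa.array(data, type=storage_type)
    json_type = pa.json_(storage_type)

    arr = pa.ExtensionArray.from_storage(json_type, storage)
    table = pa.table([arr], names=["ext"])

    # round-trip through Parquet with and without dictionary encoding
    _simple_table_roundtrip(table, use_dictionary=use_dictionary)
```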
##########
python/pyarrow/array.pxi:
##########
@@ -4343,6 +4343,32 @@ cdef class ExtensionArray(Array):
return result
+class JsonArray(ExtensionArray):
+ """
+ Concrete class for Arrow arrays of JSON data type.
+ This does not guarantee that the JSON data actually
+ valid JSON.
Review Comment:
```suggestion
is valid JSON.
```
(missing verb?)
##########
python/pyarrow/types.pxi:
##########
@@ -5236,6 +5273,43 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def json_(DataType storage_type=utf8()):
+ """
+ Create instance of JSON extension type.
+
+ Parameters
+ ----------
+ storage_type : DataType
Review Comment:
```suggestion
storage_type : DataType, default pyarrow.string()
```
##########
python/pyarrow/tests/parquet/test_data_types.py:
##########
@@ -510,3 +510,16 @@ def test_large_binary_overflow():
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)
+
+
[email protected]("storage_type", (
+ pa.utf8(), pa.large_utf8(), pa.string(), pa.large_string()))
Review Comment:
```suggestion
pa.string(), pa.large_string()))
```
No need to parametrize with both utf8 and string, as they are just aliases
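For reference, the aliases compare equal, so parametrizing over both adds no coverage:

```python
import pyarrow as pa

assert pa.utf8() == pa.string()              # utf8 is just an alias for string
assert pa.large_utf8() == pa.large_string()  # same for the large variants
```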
##########
python/pyarrow/scalar.pxi:
##########
@@ -1044,6 +1044,15 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
+class JsonScalar(ExtensionScalar):
+ """
+ Concrete class for JSON extension scalar.
+ """
+
+ def as_py(self):
+ return None if self.value is None else self.value.as_py()
Review Comment:
Is this actually needed? (Isn't that what the base class already does?)
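A quick check (sketch only, using the `pa.json_` type added in this PR) suggesting the override can be dropped without changing behaviour:

```python
import pyarrow as pa

# build a JSON extension array with a null and look at the scalars
storage = pa.array(['{"a": 1}', None], type=pa.string())
arr = pa.ExtensionArray.from_storage(pa.json_(), storage)

assert arr[0].as_py() == '{"a": 1}'  # delegates to the storage scalar
assert arr[1].as_py() is None        # nulls already come back as None
```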
##########
python/pyarrow/types.pxi:
##########
@@ -5236,6 +5273,43 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def json_(DataType storage_type=utf8()):
+ """
+ Create instance of JSON extension type.
+
+ Parameters
+ ----------
+ storage_type : DataType
+ The underlying data type.
Review Comment:
Maybe mention here that this should be one of the string data types? (also,
do you get a proper error message when passing a non-string data type? Can you
test that?)
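A test for the non-string case could look roughly like this (the exact exception class is an assumption; adjust to whatever the validation actually raises):

```python
def test_json_invalid_storage_type():
    # a non-string storage type should be rejected with a clear error;
    # pa.ArrowInvalid is a guess for how the C++ status surfaces in Python
    with pytest.raises(pa.ArrowInvalid):
        pa.json_(pa.int64())
```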
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1926,3 +1926,50 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
[email protected]("storage_type", (
+ pa.utf8(), pa.large_utf8(), pa.string_view(), pa.string(),
+ pa.large_string()))
Review Comment:
```suggestion
pa.string(), pa.large_string(), pa.string_view()))
```
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1926,3 +1926,50 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
[email protected]("storage_type", (
+ pa.utf8(), pa.large_utf8(), pa.string_view(), pa.string(),
+ pa.large_string()))
+def test_json(storage_type, pickle_module):
+ data = ['{"a": 1}', '{"b": 2}', None]
+ storage = pa.array(data, type=storage_type)
+ json_type = pa.json_(storage_type)
+ json_arr_class = json_type.__arrow_ext_class__()
+
+ assert pa.json_() == pa.json_(pa.utf8())
Review Comment:
So this means that the storage type is not taken into account for checking
equality of the JSON extension type? Should it?
##########
python/pyarrow/types.pxi:
##########
@@ -1774,6 +1774,43 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar
+cdef class JsonType(BaseExtensionType):
+ """
+ Concrete class for Arrow arrays of JSON data type.
Review Comment:
This is not the class for the array but for the type
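For example, the first line could read something like (just a wording sketch, not tied to the final phrasing):

```suggestion
Concrete class for JSON extension type.
```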