pitrou commented on code in PR #44070:
URL: https://github.com/apache/arrow/pull/44070#discussion_r1799975031
##########
python/pyarrow/types.pxi:
##########
@@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar
+cdef class JsonType(BaseExtensionType):
+ """
+ Concrete class for JSON extension type.
+
+ Examples
+ --------
+ Define the extension type for JSON array
+
+ >>> import pyarrow as pa
+ >>> json_type = pa.json_(pa.large_utf8())
+
+ Create an extension array
+
+ >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+ >>> storage = pa.array(arr, pa.large_utf8())
Review Comment:
Side note: it would be nice if one could write `json_array = pa.array(arr, json_type)`.
Perhaps open a feature request?
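For context, a minimal sketch of the current workaround next to the suggested convenience; the direct `pa.array(arr, json_type)` call is the hypothetical feature being floated here, not something this PR implements:

```python
import pyarrow as pa

json_type = pa.json_(pa.large_utf8())
arr = [None, '{ "id":30, "values":["a", "b"] }']

# Today: build the storage array first, then wrap it in the extension type.
storage = pa.array(arr, pa.large_utf8())
json_array = pa.ExtensionArray.from_storage(json_type, storage)

# Suggested convenience (hypothetical, pending a feature request):
# json_array = pa.array(arr, json_type)
```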
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1926,3 +1926,57 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
+@pytest.mark.parametrize("storage_type", (
+ pa.string(), pa.large_string(), pa.string_view()))
+def test_json(storage_type, pickle_module):
+ data = ['{"a": 1}', '{"b": 2}', None]
+ storage = pa.array(data, type=storage_type)
+ json_type = pa.json_(storage_type)
+ json_arr_class = json_type.__arrow_ext_class__()
+
+ assert pa.json_() == pa.json_(pa.utf8())
+ assert json_type.extension_name == "arrow.json"
+ assert json_type.storage_type == storage_type
+ assert json_type.__class__ is pa.JsonType
+
+ assert json_type == pa.json_(storage_type)
+ assert json_type != storage_type
+
+ array = pa.ExtensionArray.from_storage(json_type, storage)
+ assert isinstance(array, pa.JsonArray)
+
+ assert array.to_pylist() == data
+ assert array[0].as_py() == data[0]
+ assert array[2].as_py() is None
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(json_type))
+ assert result == json_type
+
+ # IPC roundtrip
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
+ batch = ipc_read_batch(buf)
+ reconstructed_array = batch.column(0)
+ assert reconstructed_array.type == json_type
+ assert reconstructed_array == array
+ assert isinstance(array, json_arr_class)
+
+ assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
+ assert isinstance(array[0], pa.JsonScalar)
+
+ # cast storage -> extension type
+ result = storage.cast(json_type)
+ assert result == array
+
+ # cast extension type -> storage type
+ inner = array.cast(storage_type)
+ assert inner == storage
+
+ for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
+ with pytest.raises(
+ pa.ArrowInvalid,
Review Comment:
Pity this doesn't raise `TypeError`.
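To illustrate the point: the constructor currently surfaces the error as `pa.ArrowInvalid`, where `TypeError` would arguably be the more Pythonic choice. A rough sketch, assuming the truncated test body calls `pa.json_(storage_type)`:

```python
import pyarrow as pa
import pytest

# Current behavior: an unsupported storage type is rejected with ArrowInvalid.
with pytest.raises(pa.ArrowInvalid, match="Invalid storage type"):
    pa.json_(pa.int32())

# The reviewer's preference would read like this instead (not current behavior):
# with pytest.raises(TypeError):
#     pa.json_(pa.int32())
```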
##########
python/pyarrow/types.pxi:
##########
@@ -5296,6 +5333,45 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def json_(DataType storage_type=utf8()):
+ """
+ Create instance of JSON extension type.
+
+ Parameters
+ ----------
+ storage_type : DataType, default pyarrow.string()
+ The underlying data type. Can be one of the following types:
+ string, large_string, string_view.
+
Review Comment:
One superfluous line here.
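For reference, a short usage sketch of the factory as described by this docstring; it only exercises behavior the new tests below assert, nothing beyond the diff:

```python
import pyarrow as pa

# Default storage is utf8; large_string and string_view are also accepted.
assert pa.json_() == pa.json_(pa.utf8())

t = pa.json_(pa.large_string())
assert t.extension_name == "arrow.json"
assert t.storage_type == pa.large_string()
```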
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1926,3 +1926,57 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
+@pytest.mark.parametrize("storage_type", (
+ pa.string(), pa.large_string(), pa.string_view()))
+def test_json(storage_type, pickle_module):
+ data = ['{"a": 1}', '{"b": 2}', None]
+ storage = pa.array(data, type=storage_type)
+ json_type = pa.json_(storage_type)
+ json_arr_class = json_type.__arrow_ext_class__()
+
+ assert pa.json_() == pa.json_(pa.utf8())
+ assert json_type.extension_name == "arrow.json"
+ assert json_type.storage_type == storage_type
+ assert json_type.__class__ is pa.JsonType
+
+ assert json_type == pa.json_(storage_type)
+ assert json_type != storage_type
+
+ array = pa.ExtensionArray.from_storage(json_type, storage)
+ assert isinstance(array, pa.JsonArray)
+
+ assert array.to_pylist() == data
+ assert array[0].as_py() == data[0]
+ assert array[2].as_py() is None
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(json_type))
+ assert result == json_type
+
+ # IPC roundtrip
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
+ batch = ipc_read_batch(buf)
+ reconstructed_array = batch.column(0)
+ assert reconstructed_array.type == json_type
+ assert reconstructed_array == array
+ assert isinstance(array, json_arr_class)
+
+ assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
+ assert isinstance(array[0], pa.JsonScalar)
+
+ # cast storage -> extension type
+ result = storage.cast(json_type)
+ assert result == array
+
+ # cast extension type -> storage type
+ inner = array.cast(storage_type)
+ assert inner == storage
+
+ for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
+ with pytest.raises(
+ pa.ArrowInvalid,
+ match="Invalid storage type for JsonExtensionType: " +
+ str(storage_type)):
Review Comment:
Nit: can use a f-string
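For the record, the two spellings should produce the same pattern, since the f-string also stringifies the type; a tiny sketch:

```python
import pyarrow as pa

storage_type = pa.int32()

# Concatenation, as written in the diff:
msg = "Invalid storage type for JsonExtensionType: " + str(storage_type)

# f-string form suggested in the nit:
msg_f = f"Invalid storage type for JsonExtensionType: {storage_type}"

assert msg == msg_f
```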