This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new bcb4653c63 GH-44066: [Python] Add Python wrapper for JsonExtensionType
(#44070)
bcb4653c63 is described below
commit bcb4653c6387a2b22df52a3bbc91317607abdccc
Author: Rok Mihevc <[email protected]>
AuthorDate: Tue Oct 22 13:56:59 2024 +0200
GH-44066: [Python] Add Python wrapper for JsonExtensionType (#44070)
### Rationale for this change
We [added canonical
JsonExtensionType](https://github.com/apache/arrow/pull/13901) and we should
make it usable from Python.
### What changes are included in this PR?
Python wrappers for `JsonExtensionType` and `JsonArray` are added on the Python
side, as well as `JsonArray` on the C++ side.
### Are these changes tested?
Python tests for the extension type and array are included.
### Are there any user-facing changes?
This adds the canonical JSON extension type to pyarrow.
* GitHub Issue: #44066
Lead-authored-by: Rok Mihevc <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/__init__.py | 8 +--
python/pyarrow/array.pxi | 27 +++++++++
python/pyarrow/includes/libarrow.pxd | 7 +++
python/pyarrow/lib.pxd | 5 ++
python/pyarrow/public-api.pxi | 2 +
python/pyarrow/scalar.pxi | 6 ++
python/pyarrow/tests/parquet/test_data_types.py | 11 ++++
python/pyarrow/tests/test_extension_type.py | 53 +++++++++++++++++
python/pyarrow/tests/test_misc.py | 3 +
python/pyarrow/types.pxi | 75 +++++++++++++++++++++++++
10 files changed, 193 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index fb7c242187..8c8c09265d 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -172,7 +172,7 @@ from pyarrow.lib import (null, bool_,
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
- bool8, fixed_shape_tensor, opaque, uuid,
+ bool8, fixed_shape_tensor, json_, opaque, uuid,
field,
type_for_alias,
DataType, DictionaryType, StructType,
@@ -183,7 +183,7 @@ from pyarrow.lib import (null, bool_,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, Bool8Type, FixedShapeTensorType,
- OpaqueType, UuidType,
+ JsonType, OpaqueType, UuidType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
@@ -218,7 +218,7 @@ from pyarrow.lib import (null, bool_,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray,
ExtensionArray,
RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
- OpaqueArray, UuidArray,
+ JsonArray, OpaqueArray, UuidArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
@@ -236,7 +236,7 @@ from pyarrow.lib import (null, bool_,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
- FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
+ FixedShapeTensorScalar, JsonScalar, OpaqueScalar,
UuidScalar)
# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae9e7fd777..eaedbf1e38 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -4344,6 +4344,33 @@ cdef class ExtensionArray(Array):
return result
+class JsonArray(ExtensionArray):
+ """
+ Concrete class for Arrow arrays of JSON data type.
+
+ This does not guarantee that the JSON data actually
+ is valid JSON.
+
+ Examples
+ --------
+ Define the extension type for JSON array
+
+ >>> import pyarrow as pa
+ >>> json_type = pa.json_(pa.large_utf8())
+
+ Create an extension array
+
+ >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+ >>> storage = pa.array(arr, pa.large_utf8())
+ >>> pa.ExtensionArray.from_storage(json_type, storage)
+ <pyarrow.lib.JsonArray object at ...>
+ [
+ null,
+ "{ "id":30, "values":["a", "b"] }"
+ ]
+ """
+
+
class UuidArray(ExtensionArray):
"""
Concrete class for Arrow arrays of UUID data type.
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index d304641e0f..a70cb91873 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2871,6 +2871,13 @@ cdef extern from "arrow/extension_type.h" namespace
"arrow":
shared_ptr[CArray] storage()
+cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
+ cdef cppclass CJsonType"
arrow::extension::JsonExtensionType"(CExtensionType):
+
+ @staticmethod
+ CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]&
storage_type)
+
+
cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 25a7945dc3..f3d4e1eec0 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -226,6 +226,11 @@ cdef class UuidType(BaseExtensionType):
cdef:
const CUuidType* uuid_ext_type
+cdef class JsonType(BaseExtensionType):
+ cdef:
+ const CJsonType* json_ext_type
+
+
cdef class PyExtensionType(ExtensionType):
pass
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index d3e2ff2e99..913e25e308 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
out = OpaqueType.__new__(OpaqueType)
elif extension_name == b"arrow.uuid":
out = UuidType.__new__(UuidType)
+ elif extension_name == b"arrow.json":
+ out = JsonType.__new__(JsonType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 68f77832c4..2bfdcddf30 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1044,6 +1044,12 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
+class JsonScalar(ExtensionScalar):
+ """
+ Concrete class for JSON extension scalar.
+ """
+
+
class UuidScalar(ExtensionScalar):
"""
Concrete class for Uuid extension scalar.
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 79dd969482..1428f80239 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -510,3 +510,14 @@ def test_large_binary_overflow():
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)
+
+
[email protected]("storage_type", (
+ pa.string(), pa.large_string()))
+def test_json_extension_type(storage_type):
+ data = ['{"a": 1}', '{"b": 2}', None]
+ arr = pa.array(data, type=pa.json_(storage_type))
+
+ table = pa.table([arr], names=["ext"])
+
+ _simple_table_roundtrip(table)
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index b74eca75bd..634d9ce2d8 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1926,3 +1926,56 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
[email protected]("storage_type", (
+ pa.string(), pa.large_string(), pa.string_view()))
+def test_json(storage_type, pickle_module):
+ data = ['{"a": 1}', '{"b": 2}', None]
+ json_type = pa.json_(storage_type)
+ storage = pa.array(data, type=storage_type)
+ array = pa.array(data, type=json_type)
+ json_arr_class = json_type.__arrow_ext_class__()
+
+ assert pa.json_() == pa.json_(pa.utf8())
+ assert json_type.extension_name == "arrow.json"
+ assert json_type.storage_type == storage_type
+ assert json_type.__class__ is pa.JsonType
+
+ assert json_type == pa.json_(storage_type)
+ assert json_type != storage_type
+
+ assert isinstance(array, pa.JsonArray)
+
+ assert array.to_pylist() == data
+ assert array[0].as_py() == data[0]
+ assert array[2].as_py() is None
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(json_type))
+ assert result == json_type
+
+ # IPC roundtrip
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
+ batch = ipc_read_batch(buf)
+ reconstructed_array = batch.column(0)
+ assert reconstructed_array.type == json_type
+ assert reconstructed_array == array
+ assert isinstance(array, json_arr_class)
+
+ assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
+ assert isinstance(array[0], pa.JsonScalar)
+
+ # cast storage -> extension type
+ result = storage.cast(json_type)
+ assert result == array
+
+ # cast extension type -> storage type
+ inner = array.cast(storage_type)
+ assert inner == storage
+
+ for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
+ with pytest.raises(
+ pa.ArrowInvalid,
+ match=f"Invalid storage type for JsonExtensionType:
{storage_type}"):
+ pa.json_(storage_type)
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 5d3471c7c3..0b2055018f 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows():
pa.Bool8Array,
pa.Bool8Scalar,
pa.Bool8Type,
+ pa.JsonArray,
+ pa.JsonScalar,
+ pa.JsonType,
])
def test_extension_type_constructor_errors(klass):
# ARROW-2638: prevent calling extension class constructors directly
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 70f12e9796..c66ac5f28d 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar
+cdef class JsonType(BaseExtensionType):
+ """
+ Concrete class for JSON extension type.
+
+ Examples
+ --------
+ Define the extension type for JSON array
+
+ >>> import pyarrow as pa
+ >>> json_type = pa.json_(pa.large_utf8())
+
+ Create an extension array
+
+ >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+ >>> storage = pa.array(arr, pa.large_utf8())
+ >>> pa.ExtensionArray.from_storage(json_type, storage)
+ <pyarrow.lib.JsonArray object at ...>
+ [
+ null,
+ "{ "id":30, "values":["a", "b"] }"
+ ]
+ """
+
+ cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ BaseExtensionType.init(self, type)
+ self.json_ext_type = <const CJsonType*> type.get()
+
+ def __arrow_ext_class__(self):
+ return JsonArray
+
+ def __reduce__(self):
+ return json_, (self.storage_type,)
+
+ def __arrow_ext_scalar_class__(self):
+ return JsonScalar
+
+
cdef class UuidType(BaseExtensionType):
"""
Concrete class for UUID extension type.
@@ -5296,6 +5333,44 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def json_(DataType storage_type=utf8()):
+ """
+ Create instance of JSON extension type.
+
+ Parameters
+ ----------
+ storage_type : DataType, default pyarrow.string()
+ The underlying data type. Can be one of the following types:
+ string, large_string, string_view.
+
+ Returns
+ -------
+ type : JsonType
+
+ Examples
+ --------
+ Create an instance of JSON extension type:
+
+ >>> import pyarrow as pa
+ >>> pa.json_(pa.utf8())
+ JsonType(extension<arrow.json>)
+
+ Use the JSON type to create an array:
+
+ >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8()))
+ <pyarrow.lib.JsonArray object at ...>
+ [
+ "{"a": 1}",
+ "{"b": 2}"
+ ]
+ """
+
+ cdef JsonType out = JsonType.__new__(JsonType)
+ c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
+ out.init(c_json_ext_type)
+ return out
+
+
def uuid():
"""
Create UuidType instance.