This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new bcb4653c63 GH-44066: [Python] Add Python wrapper for JsonExtensionType 
(#44070)
bcb4653c63 is described below

commit bcb4653c6387a2b22df52a3bbc91317607abdccc
Author: Rok Mihevc <[email protected]>
AuthorDate: Tue Oct 22 13:56:59 2024 +0200

    GH-44066: [Python] Add Python wrapper for JsonExtensionType (#44070)
    
    ### Rationale for this change
    
    We [added canonical 
JsonExtensionType](https://github.com/apache/arrow/pull/13901) and we should 
make it usable from Python.
    
    ### What changes are included in this PR?
    
    Python wrappers for `JsonExtensionType` and `JsonArray` are added on the Python side, as well as `JsonArray` on the C++ side.
    
    ### Are these changes tested?
    
    Python tests for the extension type and array are included.
    
    ### Are there any user-facing changes?
    
    This adds a json canonical extension type to pyarrow.
    * GitHub Issue: #44066
    
    Lead-authored-by: Rok Mihevc <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/__init__.py                      |  8 +--
 python/pyarrow/array.pxi                        | 27 +++++++++
 python/pyarrow/includes/libarrow.pxd            |  7 +++
 python/pyarrow/lib.pxd                          |  5 ++
 python/pyarrow/public-api.pxi                   |  2 +
 python/pyarrow/scalar.pxi                       |  6 ++
 python/pyarrow/tests/parquet/test_data_types.py | 11 ++++
 python/pyarrow/tests/test_extension_type.py     | 53 +++++++++++++++++
 python/pyarrow/tests/test_misc.py               |  3 +
 python/pyarrow/types.pxi                        | 75 +++++++++++++++++++++++++
 10 files changed, 193 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index fb7c242187..8c8c09265d 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -172,7 +172,7 @@ from pyarrow.lib import (null, bool_,
                          union, sparse_union, dense_union,
                          dictionary,
                          run_end_encoded,
-                         bool8, fixed_shape_tensor, opaque, uuid,
+                         bool8, fixed_shape_tensor, json_, opaque, uuid,
                          field,
                          type_for_alias,
                          DataType, DictionaryType, StructType,
@@ -183,7 +183,7 @@ from pyarrow.lib import (null, bool_,
                          FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                          BaseExtensionType, ExtensionType,
                          RunEndEncodedType, Bool8Type, FixedShapeTensorType,
-                         OpaqueType, UuidType,
+                         JsonType, OpaqueType, UuidType,
                          PyExtensionType, UnknownExtensionType,
                          register_extension_type, unregister_extension_type,
                          DictionaryMemo,
@@ -218,7 +218,7 @@ from pyarrow.lib import (null, bool_,
                          MonthDayNanoIntervalArray,
                          Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
                          RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
-                         OpaqueArray, UuidArray,
+                         JsonArray, OpaqueArray, UuidArray,
                          scalar, NA, _NULL as NULL, Scalar,
                          NullScalar, BooleanScalar,
                          Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
@@ -236,7 +236,7 @@ from pyarrow.lib import (null, bool_,
                          FixedSizeBinaryScalar, DictionaryScalar,
                          MapScalar, StructScalar, UnionScalar,
                          RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
-                         FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
+                         FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar)
 
 # Buffers, allocation
 from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae9e7fd777..eaedbf1e38 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -4344,6 +4344,33 @@ cdef class ExtensionArray(Array):
         return result
 
 
+class JsonArray(ExtensionArray):
+    """
+    Concrete class for Arrow arrays of JSON data type.
+
+    This does not guarantee that the JSON data actually
+    is valid JSON.
+
+    Examples
+    --------
+    Define the extension type for JSON array
+
+    >>> import pyarrow as pa
+    >>> json_type = pa.json_(pa.large_utf8())
+
+    Create an extension array
+
+    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+    >>> storage = pa.array(arr, pa.large_utf8())
+    >>> pa.ExtensionArray.from_storage(json_type, storage)
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      null,
+      "{ "id":30, "values":["a", "b"] }"
+    ]
+    """
+
+
 class UuidArray(ExtensionArray):
     """
     Concrete class for Arrow arrays of UUID data type.
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index d304641e0f..a70cb91873 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2871,6 +2871,13 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
         shared_ptr[CArray] storage()
 
 
+cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
+    cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType):
+
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type)
+
+
 cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
     cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):
 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 25a7945dc3..f3d4e1eec0 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -226,6 +226,11 @@ cdef class UuidType(BaseExtensionType):
     cdef:
         const CUuidType* uuid_ext_type
 
+cdef class JsonType(BaseExtensionType):
+    cdef:
+        const CJsonType* json_ext_type
+
+
 cdef class PyExtensionType(ExtensionType):
     pass
 
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index d3e2ff2e99..913e25e308 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
             out = OpaqueType.__new__(OpaqueType)
         elif extension_name == b"arrow.uuid":
             out = UuidType.__new__(UuidType)
+        elif extension_name == b"arrow.json":
+            out = JsonType.__new__(JsonType)
         else:
             out = BaseExtensionType.__new__(BaseExtensionType)
     else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 68f77832c4..2bfdcddf30 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1044,6 +1044,12 @@ cdef class ExtensionScalar(Scalar):
         return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
 
 
+class JsonScalar(ExtensionScalar):
+    """
+    Concrete class for JSON extension scalar.
+    """
+
+
 class UuidScalar(ExtensionScalar):
     """
     Concrete class for Uuid extension scalar.
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 79dd969482..1428f80239 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -510,3 +510,14 @@ def test_large_binary_overflow():
                 pa.ArrowInvalid,
                 match="Parquet cannot store strings with size 2GB or more"):
             _write_table(table, writer, use_dictionary=use_dictionary)
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string()))
+def test_json_extension_type(storage_type):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    arr = pa.array(data, type=pa.json_(storage_type))
+
+    table = pa.table([arr], names=["ext"])
+
+    _simple_table_roundtrip(table)
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index b74eca75bd..634d9ce2d8 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1926,3 +1926,56 @@ def test_bool8_scalar():
     assert pa.scalar(1, type=pa.bool8()).as_py() is True
     assert pa.scalar(2, type=pa.bool8()).as_py() is True
     assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string(), pa.string_view()))
+def test_json(storage_type, pickle_module):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    json_type = pa.json_(storage_type)
+    storage = pa.array(data, type=storage_type)
+    array = pa.array(data, type=json_type)
+    json_arr_class = json_type.__arrow_ext_class__()
+
+    assert pa.json_() == pa.json_(pa.utf8())
+    assert json_type.extension_name == "arrow.json"
+    assert json_type.storage_type == storage_type
+    assert json_type.__class__ is pa.JsonType
+
+    assert json_type == pa.json_(storage_type)
+    assert json_type != storage_type
+
+    assert isinstance(array, pa.JsonArray)
+
+    assert array.to_pylist() == data
+    assert array[0].as_py() == data[0]
+    assert array[2].as_py() is None
+
+    # Pickle roundtrip
+    result = pickle_module.loads(pickle_module.dumps(json_type))
+    assert result == json_type
+
+    # IPC roundtrip
+    buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
+    batch = ipc_read_batch(buf)
+    reconstructed_array = batch.column(0)
+    assert reconstructed_array.type == json_type
+    assert reconstructed_array == array
+    assert isinstance(array, json_arr_class)
+
+    assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
+    assert isinstance(array[0], pa.JsonScalar)
+
+    # cast storage -> extension type
+    result = storage.cast(json_type)
+    assert result == array
+
+    # cast extension type -> storage type
+    inner = array.cast(storage_type)
+    assert inner == storage
+
+    for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
+        with pytest.raises(
+                pa.ArrowInvalid,
+                match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
+            pa.json_(storage_type)
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 5d3471c7c3..0b2055018f 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows():
     pa.Bool8Array,
     pa.Bool8Scalar,
     pa.Bool8Type,
+    pa.JsonArray,
+    pa.JsonScalar,
+    pa.JsonType,
 ])
 def test_extension_type_constructor_errors(klass):
     # ARROW-2638: prevent calling extension class constructors directly
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 70f12e9796..c66ac5f28d 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
         return ExtensionScalar
 
 
+cdef class JsonType(BaseExtensionType):
+    """
+    Concrete class for JSON extension type.
+
+    Examples
+    --------
+    Define the extension type for JSON array
+
+    >>> import pyarrow as pa
+    >>> json_type = pa.json_(pa.large_utf8())
+
+    Create an extension array
+
+    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+    >>> storage = pa.array(arr, pa.large_utf8())
+    >>> pa.ExtensionArray.from_storage(json_type, storage)
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      null,
+      "{ "id":30, "values":["a", "b"] }"
+    ]
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.json_ext_type = <const CJsonType*> type.get()
+
+    def __arrow_ext_class__(self):
+        return JsonArray
+
+    def __reduce__(self):
+        return json_, (self.storage_type,)
+
+    def __arrow_ext_scalar_class__(self):
+        return JsonScalar
+
+
 cdef class UuidType(BaseExtensionType):
     """
     Concrete class for UUID extension type.
@@ -5296,6 +5333,44 @@ def run_end_encoded(run_end_type, value_type):
     return pyarrow_wrap_data_type(ree_type)
 
 
+def json_(DataType storage_type=utf8()):
+    """
+    Create instance of JSON extension type.
+
+    Parameters
+    ----------
+    storage_type : DataType, default pyarrow.string()
+        The underlying data type. Can be one of the following types:
+        string, large_string, string_view.
+
+    Returns
+    -------
+    type : JsonType
+
+    Examples
+    --------
+    Create an instance of JSON extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.json_(pa.utf8())
+    JsonType(extension<arrow.json>)
+
+    Use the JSON type to create an array:
+
+    >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8()))
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      "{"a": 1}",
+      "{"b": 2}"
+    ]
+    """
+
+    cdef JsonType out = JsonType.__new__(JsonType)
+    c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
+    out.init(c_json_ext_type)
+    return out
+
+
 def uuid():
     """
     Create UuidType instance.

Reply via email to