WillAyd opened a new issue, #375:
URL: https://github.com/apache/arrow-nanoarrow/issues/375

   ### Describe the bug, including details regarding any error messages, 
version, and platform.
   
   I am trying to work with interval data passed through the new PyCapsule 
interface. This seems to work fine when everything stays in Python:
   
   ```python
   # Round-trip a month_day_nano_interval column through the Arrow C stream
   # capsule using pyarrow on both the producing and the consuming side.
   import pyarrow as pa
   # Single-column schema: "interval" of type month_day_nano_interval.
   schema = pa.schema([("interval", pa.month_day_nano_interval())])
   # Four rows: null, (1, 1, 1), (42, 42, 42), null.
   tbl = pa.Table.from_arrays([pa.array(
       [
           None,
           pa.scalar((1, 1, 1), type=pa.month_day_nano_interval()),
           pa.scalar((42, 42, 42), type=pa.month_day_nano_interval()),
           None,
       ]
   )], schema=schema)
   # Export the table as a stream capsule, then import it back and materialize.
   capsule = tbl.__arrow_c_stream__()
   stream = pa.RecordBatchReader._import_from_c_capsule(capsule)
   new = stream.read_all()
   # The re-imported table compares equal to the original.
   tbl == new  # True
   ```
   
   However, when trying to read a capsule created in an extension via nanoarrow 
that provides an equivalent array I am getting unexpected results. Assuming the 
following extension built via nanoarrow:
   
   <details>
   
   ```cpp
   #include <nanoarrow/nanoarrow.hpp>
   #include <nanobind/nanobind.h>
   
   namespace nb = nanobind;
   
   /// Capsule destructor callback for a heap-allocated ArrowArrayStream.
   ///
   /// Releases the stream if its release callback is still set, then frees the
   /// struct itself.
   ///
   /// @param ptr pointer stored in the capsule; expected to be the
   ///            ArrowArrayStream allocated with malloc() in
   ///            get_interval_capsule(). A null pointer is ignored.
   static auto releaseArrowStream(void *ptr) noexcept -> void {
     auto stream = static_cast<ArrowArrayStream *>(ptr);
     if (stream == nullptr) {
       return;
     }
     if (stream->release != nullptr) {
       ArrowArrayStreamRelease(stream);
     }
     // The struct was obtained from malloc(); releasing the stream does not
     // free that allocation, so it must be freed here or it leaks once per
     // capsule.
     free(stream);
   }
   
   
   /// Builds a struct array with one month/day/nano interval child named
   /// "interval_column" holding the rows [null, (1, 1, 1), (42, 42, 42), null]
   /// and wraps it in an "arrow_array_stream" capsule.
   ///
   /// @return capsule holding a malloc()-allocated ArrowArrayStream, with
   ///         releaseArrowStream registered as its cleanup callback.
   /// @throws std::runtime_error if any nanoarrow call returns a non-zero status
   ///         or the stream struct cannot be allocated.
   auto get_interval_capsule() -> nb::capsule {
     nanoarrow::UniqueSchema schema;
     ArrowSchemaInit(schema.get());
     if (ArrowSchemaSetTypeStruct(schema.get(), 1)) {
       throw std::runtime_error("ArrowSchemaSetTypeStruct failed");
     }

     if (ArrowSchemaSetType(schema->children[0],
                            NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO)) {
       throw std::runtime_error("ArrowSchemaSetType failed");
     }

     nanoarrow::UniqueArray array;
     if (ArrowArrayInitFromSchema(array.get(), schema.get(), nullptr)) {
       throw std::runtime_error("ArrowArrayInitFromSchema failed");
     }
     if (ArrowSchemaSetName(schema->children[0], "interval_column")) {
       throw std::runtime_error("ArrowSchemaSetName failed");
     }

     if (ArrowArrayStartAppending(array.get())) {
       throw std::runtime_error("ArrowArrayStartAppending failed");
     }

     struct ArrowInterval interval;
     ArrowIntervalInit(&interval, NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO);

     // Each row appends one value (or null) to the child column and then
     // finishes the element on the parent struct array.

     // row 0: null
     if (ArrowArrayAppendNull(array->children[0], 1)) {
       throw std::runtime_error("ArrowArrayAppendNull failed");
     }
     if (ArrowArrayFinishElement(array.get())) {
       throw std::runtime_error("ArrowArrayFinishElement failed");
     }

     // row 1: (1, 1, 1)
     interval.months = 1;
     interval.days = 1;
     interval.ns = 1;
     if (ArrowArrayAppendInterval(array->children[0], &interval)) {
       throw std::runtime_error("Failed to append interval value");
     }
     if (ArrowArrayFinishElement(array.get())) {
       throw std::runtime_error("ArrowArrayFinishElement failed");
     }

     // row 2: (42, 42, 42)
     interval.months = 42;
     interval.days = 42;
     interval.ns = 42;
     if (ArrowArrayAppendInterval(array->children[0], &interval)) {
       throw std::runtime_error("Failed to append interval value");
     }
     if (ArrowArrayFinishElement(array.get())) {
       throw std::runtime_error("ArrowArrayFinishElement failed");
     }

     // row 3: null
     if (ArrowArrayAppendNull(array->children[0], 1)) {
       throw std::runtime_error("ArrowArrayAppendNull failed");
     }
     if (ArrowArrayFinishElement(array.get())) {
       throw std::runtime_error("ArrowArrayFinishElement failed");
     }

     if (ArrowArrayFinishBuildingDefault(array.get(), nullptr)) {
       throw std::runtime_error("ArrowArrayFinishBuildingDefault failed");
     }

     // The stream struct lives on the heap so it can outlive this function
     // inside the capsule.
     auto stream = static_cast<struct ArrowArrayStream *>(
         malloc(sizeof(struct ArrowArrayStream)));
     if (stream == nullptr) {
       throw std::runtime_error("Failed to allocate ArrowArrayStream");
     }
     if (ArrowBasicArrayStreamInit(stream, schema.get(), 1)) {
       free(stream);
       throw std::runtime_error("ArrowBasicArrayStreamInit failed");
     }
     ArrowBasicArrayStreamSetArray(stream, 0, array.get());

     return nb::capsule{stream, "arrow_array_stream", &releaseArrowStream};
   }
   
   // nanobind module definition for `nanoarrow_mre`; `m` is the module handle
   // supplied by the NB_MODULE macro.
   NB_MODULE(nanoarrow_mre, m) {
     // Expose get_interval_capsule() to Python under the same name.
     m.def("get_interval_capsule", &get_interval_capsule);
   }
   ```
   
   </details>
   
   Coupled with this CMake file to build the extension:
   
   <details>
   
   ```cmake
   # Build script for the nanoarrow_mre nanobind extension (C++17).
   cmake_minimum_required(VERSION 3.18)
   project(nanoarrow_mre LANGUAGES CXX)
   set(CMAKE_CXX_STANDARD 17)
   set(CMAKE_CXX_STANDARD_REQUIRED ON)
   
   # Extra warnings on non-MSVC compilers only; the MSVC branch adds nothing.
   if (MSVC)
   else()
       add_compile_options(-Wall -Wextra)
   endif()
   
   # NOTE(review): the NumPy component does not appear to be used by the
   # extension source shown in this report - confirm whether it is needed.
   find_package(Python COMPONENTS Interpreter Development.Module NumPy REQUIRED)
   
   # Detect the installed nanobind package and import it into CMake
   execute_process(
     COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
     OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
   list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
   find_package(nanobind CONFIG REQUIRED)
     
   
   # Fetch nanoarrow from GitHub, pinned to a specific commit.
   include(FetchContent)
   FetchContent_Declare(nanoarrow-project
     GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
     GIT_TAG b3c952a3e21c2b47df85dbede3444f852614a3e2
   )
   FetchContent_MakeAvailable(nanoarrow-project)
   
   # Build the extension module from nanoarrow_mre.cpp and link it to nanoarrow.
   nanobind_add_module(nanoarrow_mre NOMINSIZE nanoarrow_mre.cpp)
   target_link_libraries(nanoarrow_mre PRIVATE nanoarrow)
   # Compile nanoarrow as position-independent code.
   set_target_properties(nanoarrow
                         PROPERTIES POSITION_INDEPENDENT_CODE
                         ON)
   
   ```
   </details>
   
   I get rather strange results:
   
   ```python
   # Import the stream capsule produced by the nanoarrow extension into pyarrow
   # and read it into a table.
   import pyarrow as pa
   import nanoarrow_mre
   capsule = nanoarrow_mre.get_interval_capsule()
   stream = pa.RecordBatchReader._import_from_c_capsule(capsule)
   tbl = stream.read_all()
   ```
   
   Here is what tbl ends up looking like:
   
   ```python
   >>> tbl
   pyarrow.Table
   interval_column: month_day_nano_interval
   ----
   interval_column: [[null,null,42M42d42ns,0M0d0ns]]
   ```
   
   As you can see from the result, the nulls are misplaced and we have likely 
lost the 1M1d1ns interval.
   
   I don't _think_ this is an issue with nanoarrow - I haven't seen it in ADBC 
and when inspecting the raw bytes I am seeing the expected data, so I _think_ 
it is specific to how the capsules are being read back into pyarrow
   
   @jorisvandenbossche @paleolimbot 
   
   ### Component(s)
   
   Python


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to