WillAyd opened a new issue, #375:
URL: https://github.com/apache/arrow-nanoarrow/issues/375
### Describe the bug, including details regarding any error messages,
version, and platform.
I am trying to work with interval data passed along the new pycapsule
interface. I noticed that this seems to work fine in the python-space:
```python
import pyarrow as pa

# Single column named "interval" holding month/day/nano interval values.
interval_type = pa.month_day_nano_interval()
schema = pa.schema([("interval", interval_type)])

# Four rows: null, (1, 1, 1), (42, 42, 42), null.
values = [
    None,
    pa.scalar((1, 1, 1), type=interval_type),
    pa.scalar((42, 42, 42), type=interval_type),
    None,
]
tbl = pa.Table.from_arrays([pa.array(values)], schema=schema)

# Export the table as an Arrow C stream capsule and read it straight back.
capsule = tbl.__arrow_c_stream__()
stream = pa.RecordBatchReader._import_from_c_capsule(capsule)
new = stream.read_all()
tbl == new # True
```
However, when trying to read a capsule created in an extension via nanoarrow
that provides an equivalent array I am getting unexpected results. Assuming the
following extension built via nanoarrow:
<details>
```cpp
#include <cstdlib>
#include <stdexcept>
#include <nanoarrow/nanoarrow.hpp>
#include <nanobind/nanobind.h>
namespace nb = nanobind;
/// Destructor callback for the "arrow_array_stream" capsule.
///
/// @param ptr the ArrowArrayStream pointer stored in the capsule; it is the
///            struct that get_interval_capsule() allocated with malloc().
///
/// Releases the stream's contents if they are still owned here, then frees the
/// struct itself. Without the free() every capsule leaked one ArrowArrayStream.
static auto releaseArrowStream(void *ptr) noexcept -> void {
  auto *stream = static_cast<ArrowArrayStream *>(ptr);
  if (stream == nullptr) {
    return;
  }
  // A null `release` marks a stream whose contents were already released or
  // moved out (e.g. by the importing consumer); only release live contents.
  if (stream->release != nullptr) {
    ArrowArrayStreamRelease(stream);
  }
  // The struct storage is always ours to free, whether or not it was moved.
  free(stream);
}
/// Build a struct array with one month/day/nano interval child column named
/// "interval_column" and wrap it in an "arrow_array_stream" capsule.
///
/// The rows appended are: null, (1, 1, 1), (42, 42, 42), null.
///
/// @return a capsule holding a malloc-allocated ArrowArrayStream, with
///         releaseArrowStream registered as its destructor.
/// @throws std::runtime_error if any nanoarrow call returns a non-zero status
///         or the stream struct cannot be allocated.
auto get_interval_capsule() -> nb::capsule {
  // Convert a non-zero nanoarrow status code into an exception.
  const auto check = [](int status, const char *what) {
    if (status) {
      throw std::runtime_error(what);
    }
  };

  nanoarrow::UniqueSchema schema;
  ArrowSchemaInit(schema.get());
  check(ArrowSchemaSetTypeStruct(schema.get(), 1),
        "ArrowSchemaSetTypeStruct failed");
  check(ArrowSchemaSetType(schema->children[0],
                           NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO),
        "ArrowSchemaSetType failed");

  nanoarrow::UniqueArray array;
  check(ArrowArrayInitFromSchema(array.get(), schema.get(), nullptr),
        "ArrowArrayInitFromSchema failed");
  check(ArrowSchemaSetName(schema->children[0], "interval_column"),
        "ArrowSchemaSetName failed");
  check(ArrowArrayStartAppending(array.get()),
        "ArrowArrayStartAppending failed");

  struct ArrowInterval interval;
  ArrowIntervalInit(&interval, NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO);

  // Each row appends to the child column first, then closes the struct element.

  // row 0
  check(ArrowArrayAppendNull(array->children[0], 1),
        "ArrowArrayAppendNull failed");
  check(ArrowArrayFinishElement(array.get()), "ArrowArrayFinishElement failed");

  // row 1
  interval.months = 1;
  interval.days = 1;
  interval.ns = 1;
  check(ArrowArrayAppendInterval(array->children[0], &interval),
        "Failed to append interval value");
  check(ArrowArrayFinishElement(array.get()), "ArrowArrayFinishElement failed");

  // row 2
  interval.months = 42;
  interval.days = 42;
  interval.ns = 42;
  check(ArrowArrayAppendInterval(array->children[0], &interval),
        "Failed to append interval value");
  check(ArrowArrayFinishElement(array.get()), "ArrowArrayFinishElement failed");

  // row 3
  check(ArrowArrayAppendNull(array->children[0], 1),
        "ArrowArrayAppendNull failed");
  check(ArrowArrayFinishElement(array.get()), "ArrowArrayFinishElement failed");

  check(ArrowArrayFinishBuildingDefault(array.get(), nullptr),
        "ArrowArrayFinishBuildingDefault failed");

  // The stream struct must outlive this function, so it lives on the heap and
  // is handed to the capsule below.
  auto *stream =
      static_cast<ArrowArrayStream *>(malloc(sizeof(ArrowArrayStream)));
  if (stream == nullptr) {
    throw std::runtime_error("Failed to allocate ArrowArrayStream");
  }
  if (ArrowBasicArrayStreamInit(stream, schema.get(), 1)) {
    // Nothing was initialized in the stream, so only the raw storage is freed.
    free(stream);
    throw std::runtime_error("ArrowBasicArrayStreamInit failed");
  }
  ArrowBasicArrayStreamSetArray(stream, 0, array.get());
  return nb::capsule{stream, "arrow_array_stream", &releaseArrowStream};
}
// Python extension module `nanoarrow_mre`: registers get_interval_capsule()
// under the same name on the module object.
NB_MODULE(nanoarrow_mre, mod) {
  mod.def("get_interval_capsule", &get_interval_capsule);
}
```
</details>
Coupled with this CMake file to build the extension:
<details>
```cmake
cmake_minimum_required(VERSION 3.18)
project(nanoarrow_mre LANGUAGES CXX)

# Require C++17 for every target in this project.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Extra warning flags are only passed to non-MSVC compilers.
if(NOT MSVC)
  add_compile_options(-Wall -Wextra)
endif()

find_package(Python COMPONENTS Interpreter Development.Module NumPy REQUIRED)

# Detect the installed nanobind package and import it into CMake
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_VARIABLE NB_DIR
  OUTPUT_STRIP_TRAILING_WHITESPACE)
list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
find_package(nanobind CONFIG REQUIRED)

# Fetch nanoarrow at a pinned commit and add it to the build.
include(FetchContent)
FetchContent_Declare(nanoarrow-project
  GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
  GIT_TAG b3c952a3e21c2b47df85dbede3444f852614a3e2
)
FetchContent_MakeAvailable(nanoarrow-project)

# The extension module links nanoarrow, which is built position-independent.
nanobind_add_module(nanoarrow_mre NOMINSIZE nanoarrow_mre.cpp)
target_link_libraries(nanoarrow_mre PRIVATE nanoarrow)
set_property(TARGET nanoarrow PROPERTY POSITION_INDEPENDENT_CODE ON)
```
</details>
I get rather strange results:
```python
import pyarrow as pa
import nanoarrow_mre
# Capsule produced by the nanoarrow-based extension shown above.
capsule = nanoarrow_mre.get_interval_capsule()
# Import the stream capsule into pyarrow and materialize it as a table.
stream = pa.RecordBatchReader._import_from_c_capsule(capsule)
tbl = stream.read_all()
```
Here is what tbl ends up looking like:
```python
>>> tbl
pyarrow.Table
interval_column: month_day_nano_interval
----
interval_column: [[null,null,42M42d42ns,0M0d0ns]]
```
As you can see from the result, the nulls are misplaced and we have likely
lost the 1M1d1ns interval.
I don't _think_ this is an issue with nanoarrow - I haven't seen it in ADBC
and when inspecting the raw bytes I am seeing the expected data, so I _think_
it is specific to how the capsules are being read back into pyarrow
@jorisvandenbossche @paleolimbot
### Component(s)
Python
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]