jorgecarleitao commented on a change in pull request #8401:
URL: https://github.com/apache/arrow/pull/8401#discussion_r523763928



##########
File path: rust/arrow/src/ffi.rs
##########
@@ -0,0 +1,657 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
+//!
+//! Generally, this module is divided into two main interfaces.
+//! The first interface maps the C ABI to native Rust types, i.e. it converts C pointers and `c_char` to native Rust.
+//! This is handled by [FFI_ArrowSchema] and [FFI_ArrowArray].
+//!
+//! The second interface maps native Rust types to the Rust-specific implementation of Arrow, such as `format` to [DataType],
+//! `Buffer`, etc. This is handled by [ArrowArray].
+//!
+//! ```rust
+//! # use std::sync::Arc;
+//! # use arrow::array::{Int32Array, Array, ArrayData, make_array_from_raw};
+//! # use arrow::error::{Result, ArrowError};
+//! # use arrow::compute::kernels::arithmetic;
+//! # use std::convert::TryFrom;
+//! # fn main() -> Result<()> {
+//! // create an array natively
+//! let array = Int32Array::from(vec![Some(1), None, Some(3)]);
+//!
+//! // export it
+//! let (array_ptr, schema_ptr) = array.to_raw()?;
+//!
+//! // consumed and used by something else...
+//!
+//! // import it
+//! let array = unsafe { make_array_from_raw(array_ptr, schema_ptr)? };
+//!
+//! // perform some operation
+//! let array = array.as_any().downcast_ref::<Int32Array>().ok_or(
+//!     ArrowError::ParseError("Expects an int32".to_string()),
+//! )?;
+//! let array = arithmetic::add(&array, &array)?;
+//!
+//! // verify
+//! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)]));
+//!
+//! // (drop/release)
+//! Ok(())
+//! }
+//! ```
+
+/*
+# Design:
+
+Main assumptions:
+* A memory region is deallocated according to its own release mechanism.
+* Rust shares memory regions between arrays.
+* A memory region should be deallocated when no-one is using it.
+
+The design of this module is as follows:
+
+`ArrowArray` contains two `Arc`s, one per ABI-compatible `struct`, each containing data
+according to the C Data Interface. These Arcs are used for ref counting of the structs
+within Rust and lifetime management.
+
+Each ABI-compatible `struct` knows how to `drop` itself, calling `release`.
+
+To import an array, unsafely create an `ArrowArray` from two pointers using [ArrowArray::try_from_raw].
+To export an array, create an `ArrowArray` using [ArrowArray::try_new].
+*/
+
+use std::{ffi::CStr, ffi::CString, iter, mem::size_of, ptr, sync::Arc};
+
+use crate::buffer::Buffer;
+use crate::datatypes::DataType;
+use crate::error::{ArrowError, Result};
+use crate::util::bit_util;
+
+/// ABI-compatible struct for `ArrowSchema` from the C Data Interface.
+///
+/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>.
+/// The declaration was created by bindgen; because the struct is `#[repr(C)]`, the order
+/// and types of the fields are part of the ABI and must not be changed.
+#[repr(C)]
+#[derive(Debug)]
+pub struct FFI_ArrowSchema {
+    /// C string describing the data type; read back through `format()`.
+    format: *const std::os::raw::c_char,
+    name: *const std::os::raw::c_char,
+    metadata: *const std::os::raw::c_char,
+    flags: i64,
+    n_children: i64,
+    children: *mut *mut FFI_ArrowSchema,
+    dictionary: *mut FFI_ArrowSchema,
+    /// Callback invoked when the struct is dropped; `None` means there is nothing to release.
+    release: Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowSchema)>,
+    private_data: *mut std::os::raw::c_void,
+}
+
+/// Callback used to drop an [FFI_ArrowSchema] when it is exported.
+///
+/// Frees the `format` string owned by the schema and marks the schema as released by
+/// setting its `release` callback to `None`.
+///
+/// # Safety
+///
+/// `schema` must be null or point to a valid [FFI_ArrowSchema] whose `format` is either
+/// null or a pointer obtained from `CString::into_raw`.
+unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
+    // a null pointer carries nothing to release; dereferencing it would be undefined behavior.
+    if schema.is_null() {
+        return;
+    }
+    let schema = &mut *schema;
+
+    if !schema.format.is_null() {
+        // take ownership back to release it.
+        drop(CString::from_raw(schema.format as *mut std::os::raw::c_char));
+        // do not leave a dangling pointer behind: a second call must not free it again.
+        schema.format = std::ptr::null();
+    }
+
+    schema.release = None;
+}
+
+impl FFI_ArrowSchema {
+    /// create a new [FFI_ArrowSchema] from a format.
+    ///
+    /// The format is stored as a heap-allocated C string owned by the returned struct;
+    /// the `release` callback is set so that the string is freed when the struct is released.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `format` contains a NUL byte, as it cannot be represented as a C string.
+    fn new(format: &str) -> FFI_ArrowSchema {
+        // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema
+        let format =
+            CString::new(format).expect("The format of a schema cannot contain NUL bytes");
+        FFI_ArrowSchema {
+            // ownership of the string moves to the struct until `release` is called.
+            format: format.into_raw(),
+            name: ptr::null(),
+            metadata: ptr::null(),
+            flags: 0,
+            n_children: 0,
+            children: ptr::null_mut(),
+            dictionary: ptr::null_mut(),
+            release: Some(release_schema),
+            private_data: ptr::null_mut(),
+        }
+    }
+
+    /// create an empty [FFI_ArrowSchema]
+    ///
+    /// Every pointer is null and no `release` callback is set, so dropping it is a no-op.
+    fn empty() -> Self {
+        Self {
+            format: ptr::null(),
+            name: ptr::null(),
+            metadata: ptr::null(),
+            flags: 0,
+            n_children: 0,
+            children: ptr::null_mut(),
+            dictionary: ptr::null_mut(),
+            release: None,
+            private_data: ptr::null_mut(),
+        }
+    }
+
+    /// returns the format of this schema.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the format pointer is null (e.g. on an empty schema) or if the format
+    /// is not valid UTF-8.
+    pub fn format(&self) -> &str {
+        // `CStr::from_ptr` on a null pointer is undefined behavior; fail loudly instead.
+        assert!(
+            !self.format.is_null(),
+            "The schema has a null format pointer"
+        );
+        // SAFETY: the pointer is non-null; it is assumed to reference a NUL-terminated
+        // string that lives at least as long as `self`.
+        unsafe { CStr::from_ptr(self.format) }
+            .to_str()
+            .expect("The external API has a non-utf8 as format")
+    }
+}
+
+impl Drop for FFI_ArrowSchema {
+    /// Invokes the `release` callback, if one is set, passing it this struct.
+    fn drop(&mut self) {
+        if let Some(callback) = self.release {
+            // the callback receives a pointer to this very struct.
+            unsafe { callback(self) }
+        }
+    }
+}
+
+/// maps a DataType `format` to a [DataType](arrow::datatypes::DataType).
+/// See https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
+///
+/// # Errors
+///
+/// Returns [ArrowError::CDataInterface], naming the offending format, when `format`
+/// is not one of the strings matched below.
+fn to_datatype(format: &str) -> Result<DataType> {
+    Ok(match format {
+        "n" => DataType::Null,
+        "b" => DataType::Boolean,
+        "c" => DataType::Int8,
+        "C" => DataType::UInt8,
+        "s" => DataType::Int16,
+        "S" => DataType::UInt16,
+        "i" => DataType::Int32,
+        "I" => DataType::UInt32,
+        "l" => DataType::Int64,
+        "L" => DataType::UInt64,
+        "e" => DataType::Float16,
+        "f" => DataType::Float32,
+        "g" => DataType::Float64,
+        "z" => DataType::Binary,
+        "Z" => DataType::LargeBinary,
+        "u" => DataType::Utf8,
+        "U" => DataType::LargeUtf8,
+        other => {
+            // `format!` is required for the placeholder to be replaced by the actual value.
+            return Err(ArrowError::CDataInterface(format!(
+                "The datatype \"{}\" is still not supported in Rust implementation",
+                other
+            )))
+        }
+    })
+}
+
+/// the inverse of [to_datatype]: maps a [DataType] to its C Data Interface format string.
+///
+/// # Errors
+///
+/// Returns [ArrowError::CDataInterface], naming the offending type, when `datatype`
+/// is not one of the variants matched below.
+fn from_datatype(datatype: &DataType) -> Result<String> {
+    let format = match datatype {
+        DataType::Null => "n",
+        DataType::Boolean => "b",
+        DataType::Int8 => "c",
+        DataType::UInt8 => "C",
+        DataType::Int16 => "s",
+        DataType::UInt16 => "S",
+        DataType::Int32 => "i",
+        DataType::UInt32 => "I",
+        DataType::Int64 => "l",
+        DataType::UInt64 => "L",
+        DataType::Float16 => "e",
+        DataType::Float32 => "f",
+        DataType::Float64 => "g",
+        DataType::Binary => "z",
+        DataType::LargeBinary => "Z",
+        DataType::Utf8 => "u",
+        DataType::LargeUtf8 => "U",
+        other => {
+            // `format!` is required for the placeholder to be replaced by the actual value.
+            return Err(ArrowError::CDataInterface(format!(
+                "The datatype \"{:?}\" is still not supported in Rust implementation",
+                other
+            )))
+        }
+    };
+    Ok(format.to_string())
+}
+
+// returns the number of bits that buffer `i` (in the C data interface) is 
expected to have.
+// This is set by the Arrow specification
+fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
+    Ok(match (data_type, i) {
+        // the null buffer is bit sized
+        (_, 0) => 1,
+        // primitive types first buffer's size is given by the native types
+        (DataType::Boolean, 1) => 1,
+        (DataType::UInt8, 1) => size_of::<u8>() * 8,
+        (DataType::UInt16, 1) => size_of::<u16>() * 8,
+        (DataType::UInt32, 1) => size_of::<u32>() * 8,
+        (DataType::UInt64, 1) => size_of::<u64>() * 8,
+        (DataType::Int8, 1) => size_of::<i8>() * 8,
+        (DataType::Int16, 1) => size_of::<i16>() * 8,
+        (DataType::Int32, 1) => size_of::<i32>() * 8,
+        (DataType::Int64, 1) => size_of::<i64>() * 8,
+        (DataType::Float32, 1) => size_of::<f32>() * 8,
+        (DataType::Float64, 1) => size_of::<f64>() * 8,
+        // primitive types have a single buffer
+        (DataType::Boolean, _) |
+        (DataType::UInt8, _) |
+        (DataType::UInt16, _) |
+        (DataType::UInt32, _) |
+        (DataType::UInt64, _) |
+        (DataType::Int8, _) |
+        (DataType::Int16, _) |
+        (DataType::Int32, _) |
+        (DataType::Int64, _) |
+        (DataType::Float32, _) |
+        (DataType::Float64, _) => {
+            return Err(ArrowError::CDataInterface(format!(
+                "The datatype \"{:?}\" expects 2 buffers, but requested {}. 
Please verify that the C data interface is correctly implemented.",
+                data_type, i
+            )))
+        }
+        // Variable-sized binaries: have two buffers.
+        // Utf8: first buffer is i32, second is in bytes
+        (DataType::Utf8, 1) => size_of::<i32>() * 8,
+        (DataType::Utf8, 2) => size_of::<u8>() * 8,
+        (DataType::Utf8, _) => {
+            return Err(ArrowError::CDataInterface(format!(
+                "The datatype \"{:?}\" expects 3 buffers, but requested {}. 
Please verify that the C data interface is correctly implemented.",

Review comment:
       The zeroth buffer, in [this line](https://github.com/apache/arrow/pull/8401/files/3429c15b418aa383fd307a178c7fd16e497bd902#diff-539f116862a6cea16ae65b6a031927a23fb3da6a1ee0223d517215fc83bf4a7aR227),
 which corresponds to the null bitmap (as per the spec), plus the 1st and the 2nd => 3.
   
   In Rust, the zeroth (null) buffer is treated separately from the rest, so I 
just added a separate match arm specifically for that one.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to