wjones127 commented on code in PR #478: URL: https://github.com/apache/arrow-adbc/pull/478#discussion_r1141477402
########## rust/src/info.rs: ########## @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for driver info +//! +//! For use with [crate::AdbcConnection::get_info]. + +use arrow_array::builder::{ + ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, ListBuilder, MapBuilder, + StringBuilder, UInt32BufferBuilder, UInt32Builder, UInt8BufferBuilder, +}; +use arrow_array::cast::{as_primitive_array, as_string_array, as_union_array}; +use arrow_array::types::UInt32Type; +use arrow_array::{Array, ArrayRef, UnionArray}; +use arrow_array::{RecordBatch, RecordBatchIterator, RecordBatchReader}; +use arrow_schema::{ArrowError, DataType, Field, Schema, UnionMode}; +use std::{borrow::Cow, collections::HashMap, sync::Arc}; + +/// Contains known info codes defined by ADBC. +pub mod codes { + /// The database vendor/product version (type: utf8). + pub const VENDOR_NAME: u32 = 0; + /// The database vendor/product version (type: utf8). + pub const VENDOR_VERSION: u32 = 1; + /// The database vendor/product Arrow library version (type: utf8). + pub const VENDOR_ARROW_VERSION: u32 = 2; + /// The driver name (type: utf8). + pub const DRIVER_NAME: u32 = 100; + /// The driver version (type: utf8). + pub const DRIVER_VERSION: u32 = 101; + /// The driver Arrow library version (type: utf8). + pub const DRIVER_ARROW_VERSION: u32 = 102; +} + +pub fn info_schema() -> Schema { + Schema::new(vec![ + Field::new("info_name", DataType::UInt32, false), + Field::new( + "info_value", + DataType::Union( + vec![ + Field::new("string_value", DataType::Utf8, true), + Field::new("bool_value", DataType::Boolean, true), + Field::new("int64_value", DataType::Int64, true), + Field::new("int32_bitmask", DataType::Int32, true), + Field::new( + "string_list", + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + true, + ), + Field::new( + "int32_to_int32_list_map", + DataType::Map( + Box::new(Field::new( + "entries", + DataType::Struct(vec![ + Field::new("keys", DataType::Int32, false), + Field::new( + "values", + DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + ))), + true, + ), + ]), + false, + )), + false, + ), + true, + ), + ], + vec![0, 1, 2, 3, 4, 5], + UnionMode::Dense, + ), + true, + ), + ]) +} + +/// Rust representations of database/drier metadata +#[derive(Clone, Debug, PartialEq)] +pub enum InfoData { + StringValue(Cow<'static, str>), + BoolValue(bool), + Int64Value(i64), + Int32Bitmask(i32), + StringList(Vec<String>), + Int32ToInt32ListMap(HashMap<i32, Vec<i32>>), +} + +pub fn export_info_data( + info_iter: impl IntoIterator<Item = (u32, InfoData)>, +) -> Box<dyn RecordBatchReader> { + let info_iter = info_iter.into_iter(); + + let mut codes = UInt32Builder::with_capacity(info_iter.size_hint().0); + + // Type id tells which array the value is in + let mut type_id = UInt8BufferBuilder::new(info_iter.size_hint().0); + // Value offset tells the offset of the value in the respective array + let mut value_offsets = UInt32BufferBuilder::new(info_iter.size_hint().0); + + // Make one builder per child of union array. Will combine after. + let mut string_values = StringBuilder::new(); + let mut bool_values = BooleanBuilder::new(); + let mut int64_values = Int64Builder::new(); + let mut int32_bitmasks = Int32Builder::new(); + let mut string_lists = ListBuilder::new(StringBuilder::new()); + let mut int32_to_int32_list_maps = MapBuilder::new( + None, + Int32Builder::new(), + ListBuilder::new(Int32Builder::new()), + ); + + for (code, info) in info_iter { + codes.append_value(code); + + match info { + InfoData::StringValue(val) => { + string_values.append_value(val); + type_id.append(0); + let value_offset = string_values.len() - 1; + value_offsets.append( + value_offset + .try_into() + .expect("Array has more values than can be indexed by u32"), + ); + } + _ => { + todo!("support other types in info_data") + } + }; + } + + let arrays: Vec<ArrayRef> = vec![ + Arc::new(string_values.finish()), + Arc::new(bool_values.finish()), + Arc::new(int64_values.finish()), + Arc::new(int32_bitmasks.finish()), + Arc::new(string_lists.finish()), + Arc::new(int32_to_int32_list_maps.finish()), + ]; + let info_schema = info_schema(); + let union_fields = { + match info_schema.field(1).data_type() { + DataType::Union(fields, _, _) => fields, + _ => unreachable!(), + } + }; + let children = union_fields + .iter() + .map(|f| f.to_owned()) + .zip(arrays.into_iter()) + .collect(); + let info_value = UnionArray::try_new( + &[0, 1, 2, 3, 4, 5], + type_id.finish(), + Some(value_offsets.finish()), + children, + ) + .expect("Info value array is always valid."); + + let batch: RecordBatch = RecordBatch::try_new( + Arc::new(info_schema), + vec![Arc::new(codes.finish()), Arc::new(info_value)], + ) + .expect("Info data batch is always valid."); + + let schema = batch.schema(); + Box::new(RecordBatchIterator::new( + std::iter::once(batch).map(Ok), + schema, + )) +} + +pub fn import_info_data( + reader: Box<dyn RecordBatchReader>, +) -> Result<Vec<(u32, InfoData)>, ArrowError> { + let batches = reader.collect::<Result<Vec<RecordBatch>, ArrowError>>()?; + + Ok(batches + .iter() + .flat_map(|batch| { Review Comment: That's an issue, but also the lifetimes of the arrays are a bit messy. I tried this out, but couldn't find an easy solution. I don't think it's worth optimizing much since it's going to be handling very few values. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
