alamb commented on code in PR #7434:
URL: https://github.com/apache/arrow-rs/pull/7434#discussion_r2096423625
##########
arrow-avro/src/lib.rs:
##########
@@ -36,19 +36,37 @@ pub mod reader;
/// Avro schema parsing and representation
///
/// Provides types for parsing and representing Avro schema definitions.
-mod schema;
+pub mod schema;
Review Comment:
is it needed to make these module public?
##########
arrow-avro/src/reader/record.rs:
##########
@@ -34,15 +34,32 @@ use std::sync::Arc;
pub struct RecordDecoder {
schema: SchemaRef,
fields: Vec<Decoder>,
+ use_utf8view: bool,
}
impl RecordDecoder {
/// Create a new [`RecordDecoder`] from the provided [`AvroDataType`]
pub fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> {
+ Self::try_new_with_options(data_type, false)
+ }
+
+ /// Create a new [`RecordDecoder`] from the provided [`AvroDataType`] with
additional options
+ ///
+ /// This method allows you to customize how the Avro data is decoded into
Arrow arrays.
+ /// In particular, it allows enabling Utf8View support for better string
performance.
+ ///
+ /// # Parameters
+ /// * `data_type` - The Avro data type to decode
+ /// * `use_utf8view` - If true, use StringViewArray instead of StringArray
for string data
+ pub fn try_new_with_options(
+ data_type: &AvroDataType,
+ use_utf8view: bool,
Review Comment:
Shall we use `ReadOptions` here rather than the single field? That would
make updating these easier in the future
##########
arrow-avro/src/reader/mod.rs:
##########
@@ -22,14 +22,42 @@ use crate::reader::header::{Header, HeaderDecoder};
use arrow_schema::ArrowError;
use std::io::BufRead;
-mod header;
-
mod block;
-
mod cursor;
+mod header;
mod record;
mod vlq;
+/// Configuration options for reading Avro data into Arrow arrays
+///
+/// This struct contains configuration options that control how Avro data is
+/// converted into Arrow arrays. It allows customizing various aspects of the
+/// data conversion process.
+///
+/// # Examples
+///
+/// ```
+/// # use arrow_avro::reader::ReadOptions;
+/// // Use default options (regular StringArray for strings)
+/// let default_options = ReadOptions::default();
+///
+/// // Enable Utf8View support for better string performance
+/// let options = ReadOptions {
+/// use_utf8view: true,
+/// ..ReadOptions::default()
+/// };
+/// ```
+#[derive(Default)]
+pub struct ReadOptions {
+ /// If true, use StringViewArray instead of StringArray for string data
+ ///
+ /// When this option is enabled, string data from Avro files will be loaded
+ /// into Arrow's StringViewArray instead of the standard StringArray.
+ ///
+ /// Default: false
+ pub use_utf8view: bool,
Review Comment:
I think it would be nice if this field was not pub -- that way we could add
new options to `ReadOptions` in the future without it being a breaking API
change
##########
arrow-avro/src/codec.rs:
##########
@@ -364,3 +437,213 @@ fn make_data_type<'a>(
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::schema::{
+ Attributes, ComplexType, Fixed, PrimitiveType, Record, Schema, Type,
TypeName,
+ };
+ use serde_json;
+ use std::collections::HashMap;
+
+ fn create_schema_with_logical_type(
+ primitive_type: PrimitiveType,
+ logical_type: &'static str,
+ ) -> Schema<'static> {
+ let attributes = Attributes {
+ logical_type: Some(logical_type),
+ additional: Default::default(),
+ };
+
+ Schema::Type(Type {
+ r#type: TypeName::Primitive(primitive_type),
+ attributes,
+ })
+ }
+
+ fn create_fixed_schema(size: usize, logical_type: &'static str) ->
Schema<'static> {
+ let attributes = Attributes {
+ logical_type: Some(logical_type),
+ additional: Default::default(),
+ };
+
+ Schema::Complex(ComplexType::Fixed(Fixed {
+ name: "fixed_type",
+ namespace: None,
+ aliases: Vec::new(),
+ size,
+ attributes,
+ }))
+ }
+
+ #[test]
+ fn test_date_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Int,
"date");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::Date32));
+ }
+
+ #[test]
+ fn test_time_millis_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Int,
"time-millis");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::TimeMillis));
+ }
+
+ #[test]
+ fn test_time_micros_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Long,
"time-micros");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::TimeMicros));
+ }
+
+ #[test]
+ fn test_timestamp_millis_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Long,
"timestamp-millis");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::TimestampMillis(true)));
+ }
+
+ #[test]
+ fn test_timestamp_micros_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Long,
"timestamp-micros");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::TimestampMicros(true)));
+ }
+
+ #[test]
+ fn test_local_timestamp_millis_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Long,
"local-timestamp-millis");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::TimestampMillis(false)));
+ }
+
+ #[test]
+ fn test_local_timestamp_micros_logical_type() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Long,
"local-timestamp-micros");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert!(matches!(result.codec, Codec::TimestampMicros(false)));
+ }
+
+ #[test]
+ fn test_duration_logical_type() {
+ let mut codec = Codec::Fixed(12);
+
+ if let c @ Codec::Fixed(12) = &mut codec {
+ *c = Codec::Interval;
+ }
+
+ assert!(matches!(codec, Codec::Interval));
+ }
+
+ #[test]
+ fn test_decimal_logical_type_not_implemented() {
+ let mut codec = Codec::Fixed(16);
+
+ let process_decimal = || -> Result<(), ArrowError> {
+ if let Codec::Fixed(_) = codec {
+ return Err(ArrowError::NotYetImplemented(
+ "Decimals are not currently supported".to_string(),
+ ));
+ }
+ Ok(())
+ };
+
+ let result = process_decimal();
+
+ assert!(result.is_err());
+ if let Err(ArrowError::NotYetImplemented(msg)) = result {
+ assert!(msg.contains("Decimals are not currently supported"));
+ } else {
+ panic!("Expected NotYetImplemented error");
+ }
+ }
+
+ #[test]
+ fn test_unknown_logical_type_added_to_metadata() {
+ let schema = create_schema_with_logical_type(PrimitiveType::Int,
"custom-type");
+
+ let mut resolver = Resolver::default();
+ let result = make_data_type(&schema, None, &mut resolver,
false).unwrap();
+
+ assert_eq!(
+ result.metadata.get("logicalType"),
+ Some(&"custom-type".to_string())
+ );
+ }
+
+ #[test]
+ fn test_string_with_utf8view_enabled() {
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]