alamb commented on code in PR #8402:
URL: https://github.com/apache/arrow-rs/pull/8402#discussion_r2370089872
##########
arrow-avro/src/lib.rs:
##########
@@ -15,9 +15,165 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro]
+//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro].
 //!
-//! [Apache Arrow]: https://arrow.apache.org
+//! This crate provides:
+//! - a [`reader`] that decodes Avro (Object Container Files, Avro Single‑Object encoding,
+//!   and Confluent Schema Registry wire format) into Arrow `RecordBatch`es,
+//! - and a [`writer`] that encodes Arrow `RecordBatch`es into Avro (OCF or raw Avro binary).
+//!
+//! If you’re new to Arrow or Avro, see:
+//! - Arrow project site: <https://arrow.apache.org/>
+//! - Avro 1.11.1 specification: <https://avro.apache.org/docs/1.11.1/specification/>
+//!
+//! ## Quickstart: OCF (Object Container File) round‑trip *(runnable)*
+//!
+//! The example below creates an Arrow table, writes an **Avro OCF** fully in memory,
+//! and then reads it back. OCF is a self‑describing file format that embeds the Avro
+//! schema in a header with optional compression and block sync markers.
+//! Spec: <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
+//!
+//! ```
+//! use std::io::Cursor;
+//! use std::sync::Arc;
+//! use arrow_array::{ArrayRef, Int32Array, RecordBatch};
+//! use arrow_schema::{DataType, Field, Schema};
+//! use arrow_avro::writer::AvroWriter;
+//! use arrow_avro::reader::ReaderBuilder;
+//!
+//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! // Build a tiny Arrow batch
+//! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+//! let batch = RecordBatch::try_new(
+//!     Arc::new(schema.clone()),
+//!     vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
+//! )?;
+//!
+//! // Write an Avro **Object Container File** (OCF) to a Vec<u8>
+//! let sink: Vec<u8> = Vec::new();
+//! let mut w = AvroWriter::new(sink, schema.clone())?;
+//! w.write(&batch)?;
+//! w.finish()?;
+//! let bytes = w.into_inner();
+//! assert!(!bytes.is_empty());
+//!
+//! // Read it back
+//! let mut r = ReaderBuilder::new().build(Cursor::new(bytes))?;
+//! let out = r.next().unwrap()?;
+//! assert_eq!(out.num_rows(), 3);
+//! # Ok(()) }
+//! ```
+//!
+//! ## Quickstart: Confluent wire‑format round‑trip *(runnable)*
+//!
+//! The **Confluent Schema Registry wire format** prefixes each Avro message with a
+//! 1‑byte magic `0x00` and a **4‑byte big‑endian** schema ID, followed by the Avro body.
+//! See: <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
+//!
+//! In this round‑trip, we:
+//! 1) Use `AvroStreamWriter` to create a **raw Avro body** for a single‑row batch,
+//! 2) Wrap it with the Confluent prefix (magic and schema ID),
+//! 3) Decode it back to Arrow using a `Decoder` configured with a `SchemaStore` that
+//!    maps the schema ID to the Avro schema used by the writer.
+//!
+//! ```ignore

Review Comment:
   can we enable this one now that we have merged - https://github.com/apache/arrow-rs/pull/8371

##########
arrow-avro/src/reader/mod.rs:
##########
@@ -128,47 +136,311 @@
 //! }
 //! ```
 //!
-//! ### Building a `Decoder` for **single‑object encoding** (Rabin fingerprints)
+//! ### Building and using a `Decoder` for **single‑object encoding** (Rabin fingerprints)
+//!
+//! The doctest below constructs a single‑object frame for a simple writer schema
(`{"type":"record","name":"User","fields":[{"name":"id","type":"long"}]}`) and then +//! decodes it into a `RecordBatch`. The Avro body encodes the long value `42` using +//! standard zig‑zag varint encoding. //! -//! ```no_run -//! use arrow_avro::schema::{AvroSchema, SchemaStore}; +//! ``` +//! use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, SINGLE_OBJECT_MAGIC}; //! use arrow_avro::reader::ReaderBuilder; //! -//! // Build a SchemaStore and register known writer schemas -//! let mut store = SchemaStore::new(); // Rabin by default -//! let user_schema = AvroSchema::new(r#"{"type":"record","name":"User","fields":[ -//! {"name":"id","type":"long"},{"name":"name","type":"string"}]}"#.to_string()); -//! let _fp = store.register(user_schema).unwrap(); // computes Rabin CRC-64-AVRO -//! -//! // Build a Decoder that expects single-object encoding (0xC3 0x01 + fingerprint and body) -//! let decoder = ReaderBuilder::new() -//! .with_writer_schema_store(store) -//! .with_batch_size(1024) -//! .build_decoder() -//! .unwrap(); -//! // Feed decoder with framed bytes (not shown; see `decode_stream` above). +//! # fn main() -> Result<(), Box<dyn std::error::Error>> { +//! // Register the writer schema (Rabin fingerprint by default). +//! let mut store = SchemaStore::new(); +//! let avro_schema = AvroSchema::new(r#"{"type":"record","name":"User","fields":[ +//! {"name":"id","type":"long"}]}"#.to_string()); +//! let fp = store.register(avro_schema)?; +//! +//! // Minimal Avro long encoder (zig-zag + varint). Review Comment: can we use the Avro writer here too? If the idea is to show how to use the decoder, maybe we could just hide the code that creates the input `body` (aka prefix all the lines related to creating the file with `# `) The same thing applies to the examples below too ########## arrow-avro/src/lib.rs: ########## @@ -15,9 +15,165 @@ // specific language governing permissions and limitations // under the License. -//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro] +//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro]. //! -//! [Apache Arrow]: https://arrow.apache.org +//! This crate provides: +//! - a [`reader`] that decodes Avro (Object Container Files, Avro Single‑Object encoding, +//! and Confluent Schema Registry wire format) into Arrow `RecordBatch`es, +//! - and a [`writer`] that encodes Arrow `RecordBatch`es into Avro (OCF or raw Avro binary). +//! +//! If you’re new to Arrow or Avro, see: +//! - Arrow project site: <https://arrow.apache.org/> +//! - Avro 1.11.1 specification: <https://avro.apache.org/docs/1.11.1/specification/> +//! +//! ## Quickstart: OCF (Object Container File) round‑trip *(runnable)* Review Comment: What does the "runnable" mean here? Another annotation you might want to use is `Example` as a prefix: ```suggestion //! ## Example: OCF (Object Container File) round‑trip ``` ########## arrow-avro/src/writer/mod.rs: ########## @@ -19,19 +19,115 @@ //! //! # Overview //! -//! * Use **`AvroWriter`** (Object Container File) when you want a -//! self‑contained Avro file with header, schema JSON, optional compression, -//! blocks, and sync markers. -//! * Use **`AvroStreamWriter`** (raw binary stream) when you already know the -//! schema out‑of‑band (i.e., via a schema registry) and need a stream -//! of Avro‑encoded records with minimal framing. +//! Use this module to serialize Arrow `RecordBatch` values into Avro. Two output +//! formats are supported: //! 
##########
arrow-avro/src/lib.rs:
##########
@@ -15,9 +15,165 @@
+//!
+//! ## Quickstart: OCF (Object Container File) round‑trip *(runnable)*

Review Comment:
   What does the "runnable" mean here? Another annotation you might want to use is `Example` as a prefix:
   ```suggestion
   //! ## Example: OCF (Object Container File) round‑trip
   ```

##########
arrow-avro/src/writer/mod.rs:
##########
@@ -19,19 +19,115 @@
 //!
 //! # Overview
 //!
-//! * Use **`AvroWriter`** (Object Container File) when you want a
-//!   self‑contained Avro file with header, schema JSON, optional compression,
-//!   blocks, and sync markers.
-//! * Use **`AvroStreamWriter`** (raw binary stream) when you already know the
-//!   schema out‑of‑band (i.e., via a schema registry) and need a stream
-//!   of Avro‑encoded records with minimal framing.
+//! Use this module to serialize Arrow `RecordBatch` values into Avro. Two output
+//! formats are supported:
 //!
-
-/// Encodes `RecordBatch` into the Avro binary format.
-pub mod encoder;
-/// Logic for different Avro container file formats.
-pub mod format;
-
+//! * **`AvroWriter`** — writes an **Object Container File (OCF)**: a self‑describing
+//!   file with header (schema JSON + metadata), optional compression, data blocks, and
+//!   sync markers. See Avro 1.11.1 “Object Container Files.”
+//!   <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
+//! * **`AvroStreamWriter`** — writes a **raw Avro binary stream** (“datum” bytes) without
+//!   any container framing. This is useful when the schema is known out‑of‑band (i.e.,
+//!   via a registry) and you want minimal overhead.
+//!
+//! ## Which format should I use?
+//!
+//! * Use **OCF** when you need a portable, self‑contained file. The schema travels with
+//!   the data, making it easy to read elsewhere.
+//! * Use the **raw stream** when your surrounding protocol supplies schema information
+//!   (i.e., a schema registry). If you need **single‑object encoding (SOE)** or Confluent
+//!   **Schema Registry** framing, you must add the appropriate prefix *outside* this writer:
+//!   - **SOE**: `0xC3 0x01` + 8‑byte little‑endian CRC‑64‑AVRO fingerprint + Avro body
+//!     (see Avro 1.11.1 “Single object encoding”).
+//!     <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
+//!   - **Confluent wire format**: magic `0x00` + **big‑endian** 4‑byte schema ID and Avro body.
+//!     <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
+//!
+//! ## Quickstart: Write an OCF in memory (runnable)

Review Comment:
   I personally suggest moving this example to the `AvroWriter` struct itself as I think it will be easier to find

##########
arrow-avro/src/writer/mod.rs:
##########
@@ -19,19 +19,115 @@
+//! ## Quickstart: Write an OCF in memory (runnable)
+//!
+//! ```
+//! use std::io::Cursor;
+//! use std::sync::Arc;
+//! use arrow_array::{ArrayRef, Int64Array, StringArray, RecordBatch};
+//! use arrow_schema::{DataType, Field, Schema};
+//! use arrow_avro::writer::AvroWriter;
+//! use arrow_avro::reader::ReaderBuilder;
+//! use arrow_avro::schema::AvroSchema;
+//!
+//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! // Writer schema: { id: long, name: string }
+//! let writer_schema = Schema::new(vec![
+//!     Field::new("id", DataType::Int64, false),
+//!     Field::new("name", DataType::Utf8, false),
+//! ]);
+//! let batch = RecordBatch::try_new(
+//!     Arc::new(writer_schema.clone()),
+//!     vec![
+//!         Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
+//!         Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
+//!     ],
+//! )?;
+//!
+//! // Write an Avro **Object Container File** (OCF) to memory
+//! let mut w = AvroWriter::new(Vec::<u8>::new(), writer_schema.clone())?;
+//! w.write(&batch)?;
+//! w.finish()?;
+//! let bytes = w.into_inner();
+//!
+//! // Build a Reader over the OCF bytes
+//! let mut r = ReaderBuilder::new()
+//!     .build(Cursor::new(bytes))?;
+//!
+//! // Decode one batch and assert row count
+//! let out = r.next().unwrap()?;
+//! assert_eq!(out.num_rows(), 2);
+//! # Ok(()) }
+//! ```
+//!
+//! ## Quickstart: Write a raw Avro binary stream (runnable)
+//!
+//! This writes only the **Avro body** bytes - no OCF header/sync, no SOE/Confluent prefix.
+//! If you plan to interoperate with a schema registry, add the appropriate prefix yourself
+//! (see links above).
+//!
+//! ```ignore

Review Comment:
   this can run too?
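If this example can indeed run now, its body might look like the minimal sketch below. It assumes `AvroStreamWriter` mirrors `AvroWriter`'s `new` / `write` / `finish` / `into_inner` surface, which is how the module text describes it; the sink then holds only raw Avro record bytes with no container framing.

```rust
use std::sync::Arc;
use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use arrow_avro::writer::AvroStreamWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
    let batch = RecordBatch::try_new(
        Arc::new(schema.clone()),
        vec![Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef],
    )?;

    // Assumed to share AvroWriter's API: write raw Avro bodies to the sink,
    // with no OCF header, sync markers, or SOE/Confluent prefix.
    let mut w = AvroStreamWriter::new(Vec::<u8>::new(), schema)?;
    w.write(&batch)?;
    w.finish()?;
    let body = w.into_inner();
    assert!(!body.is_empty());
    Ok(())
}
```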
##########
arrow-avro/src/lib.rs:
##########
@@ -15,9 +15,165 @@
+//! In this round‑trip, we:
+//! 1) Use `AvroStreamWriter` to create a **raw Avro body** for a single‑row batch,
+//! 2) Wrap it with the Confluent prefix (magic and schema ID),
+//! 3) Decode it back to Arrow using a `Decoder` configured with a `SchemaStore` that
+//!    maps the schema ID to the Avro schema used by the writer.
+//!
+//! ```ignore
+//! use arrow_avro::reader::ReaderBuilder;
+//! use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, FingerprintAlgorithm, CONFLUENT_MAGIC};
+//!
+//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! // Writer schema registered under Schema Registry ID 1
+//! let avro_json = r#"{
+//!   "type":"record","name":"User",
+//!   "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"}]
+//! }"#;
+//!
+//! let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::None);
+//! let id: u32 = 1;
+//! store.set(Fingerprint::Id(id), AvroSchema::new(avro_json.to_string()))?;
+//!
+//! // Minimal Avro body encoder for {id: long, name: string}
+//! fn enc_long(v: i64, out: &mut Vec<u8>) {
+//!     let mut n = ((v << 1) ^ (v >> 63)) as u64;
+//!     while (n & !0x7F) != 0 { out.push(((n as u8) & 0x7F) | 0x80); n >>= 7; }
+//!     out.push(n as u8);
+//! }
+//! fn enc_len(l: usize, out: &mut Vec<u8>) { enc_long(l as i64, out); }

Review Comment:
   This example doesn't actually use an `AvroStreamWriter` but the text says it does. Can we update it to use the writer?
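An updated example that actually uses the writer might look like the sketch below: the hand-rolled varint helpers are replaced by an `AvroStreamWriter` producing the raw Avro body, which is then wrapped in the Confluent prefix. The stream writer's constructor mirroring `AvroWriter` is an assumption here; the store, frame, and decoder calls follow the original example.

```rust
use std::sync::Arc;
use arrow_array::{ArrayRef, Int64Array, StringArray, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use arrow_avro::reader::ReaderBuilder;
use arrow_avro::schema::{AvroSchema, Fingerprint, FingerprintAlgorithm, SchemaStore, CONFLUENT_MAGIC};
use arrow_avro::writer::AvroStreamWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Registry side: schema ID 1 maps to the writer schema.
    let avro_json = r#"{"type":"record","name":"User",
        "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"}]}"#;
    let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::None);
    let id: u32 = 1;
    store.set(Fingerprint::Id(id), AvroSchema::new(avro_json.to_string()))?;

    // Writer side: produce the raw Avro body with the stream writer
    // instead of hand-rolling zig-zag varints.
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, false),
    ]);
    let batch = RecordBatch::try_new(
        Arc::new(schema.clone()),
        vec![
            Arc::new(Int64Array::from(vec![42])) as ArrayRef,
            Arc::new(StringArray::from(vec!["alice"])) as ArrayRef,
        ],
    )?;
    let mut w = AvroStreamWriter::new(Vec::<u8>::new(), schema)?;
    w.write(&batch)?;
    w.finish()?;
    let body = w.into_inner();

    // Confluent frame: 0x00 magic, 4-byte big-endian schema ID, Avro body.
    let mut frame = Vec::new();
    frame.extend_from_slice(&CONFLUENT_MAGIC);
    frame.extend_from_slice(&id.to_be_bytes());
    frame.extend_from_slice(&body);

    // Reader side: decode through the schema store, as in the original.
    let mut dec = ReaderBuilder::new()
        .with_writer_schema_store(store)
        .build_decoder()?;
    dec.decode(&frame)?;
    let out = dec.flush()?.expect("one row");
    assert_eq!(out.num_rows(), 1);
    Ok(())
}
```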
##########
arrow-avro/src/lib.rs:
##########
@@ -15,9 +15,165 @@
+//! fn enc_str(s: &str, out: &mut Vec<u8>) { enc_len(s.len(), out); out.extend_from_slice(s.as_bytes()); }
+//!
+//! // Encode one record { id: 42, name: "alice" }
+//! let mut body = Vec::new(); enc_long(42, &mut body); enc_str("alice", &mut body);
+//!
+//! // Confluent frame: 0x00 + 4‑byte **big‑endian** ID + Avro body
+//! let mut frame = Vec::new();
+//! frame.extend_from_slice(&CONFLUENT_MAGIC);
+//! frame.extend_from_slice(&id.to_be_bytes());
+//! frame.extend_from_slice(&body);
+//!
+//! // Decode
+//! let mut dec = ReaderBuilder::new()
+//!     .with_writer_schema_store(store)
+//!     .build_decoder()?;
+//! dec.decode(&frame)?;
+//! let out = dec.flush()?.expect("one row");
+//! assert_eq!(out.num_rows(), 1);
+//! # Ok(()) }
+//! ```
+//!
+//! ## Quickstart: Avro Single‑Object Encoding round‑trip *(runnable)*
+//!
+//! Avro **Single‑Object Encoding (SOE)** prefixes an Avro body with a 2‑byte marker
+//! `0xC3 0x01` and an **8‑byte little‑endian CRC‑64‑AVRO Rabin fingerprint** of the
+//! writer schema. Spec:
+//! <https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
+//!
+//! This example registers the writer schema (computing a Rabin fingerprint), writes a
+//! single‑row Avro body, constructs the SOE frame, and decodes it back to Arrow.
+//!
+//! ```ignore

Review Comment:
   likewise here, this should now work, right? And we can use the Avro writer?
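As with the Confluent example, the body could come from `AvroStreamWriter`; only the prefix differs. A hypothetical `soe_frame` helper sketches the framing step, again assuming `Fingerprint::Rabin(u64)` and the two-byte `SINGLE_OBJECT_MAGIC` constant:

```rust
use arrow_avro::schema::{AvroSchema, Fingerprint, SchemaStore, SINGLE_OBJECT_MAGIC};

/// Hypothetical helper: wrap a raw Avro body (e.g., produced by an
/// `AvroStreamWriter`) in a single-object frame for the given writer schema.
fn soe_frame(
    store: &mut SchemaStore,
    schema: AvroSchema,
    body: &[u8],
) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
    let fp = store.register(schema)?; // Rabin CRC-64-AVRO by default
    let mut frame = Vec::new();
    frame.extend_from_slice(&SINGLE_OBJECT_MAGIC); // 0xC3 0x01
    if let Fingerprint::Rabin(v) = fp {
        frame.extend_from_slice(&v.to_le_bytes()); // little-endian, per the spec
    }
    frame.extend_from_slice(body);
    Ok(frame)
}
```

Registering through the same `SchemaStore` the `Decoder` is built from keeps the fingerprint and the decoder's lookup table in sync.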
##########
arrow-avro/src/writer/mod.rs:
##########
@@ -19,19 +19,115 @@
+//!   <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
+//! * **`AvroStreamWriter`** — writes a **raw Avro binary stream** (“datum” bytes) without

Review Comment:
   ```suggestion
   //! * **[`AvroStreamWriter`]** — writes a **raw Avro binary stream** (“datum” bytes) without
   ```

##########
arrow-avro/src/writer/mod.rs:
##########
@@ -19,19 +19,115 @@
+//! ## Quickstart: Write a raw Avro binary stream (runnable)
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//! use arrow_array::{ArrayRef, Int64Array, RecordBatch};
+//! use arrow_schema::{DataType, Field, Schema};
+//! use arrow_avro::writer::AvroStreamWriter;

Review Comment:
   likewise, I suggest moving this to the docs on `AvroStreamWriter` as I think it will be easier to find

##########
arrow-avro/src/writer/mod.rs:
##########
@@ -19,19 +19,115 @@
+//! * **`AvroWriter`** — writes an **Object Container File (OCF)**: a self‑describing

Review Comment:
   If you make this a link, then it might be easier to find the structures if we also move the examples:
   ```suggestion
   //! * **[`AvroWriter`]** — writes an **Object Container File (OCF)**: a self‑describing
   ```