jecsand838 commented on code in PR #8006: URL: https://github.com/apache/arrow-rs/pull/8006#discussion_r2244100996
########## arrow-avro/src/reader/mod.rs: ########## @@ -272,17 +450,70 @@ impl ReaderBuilder { self } - /// Sets the Avro schema. + /// Sets the Avro reader schema. /// /// If a schema is not provided, the schema will be read from the Avro file header. - pub fn with_schema(mut self, schema: AvroSchema<'static>) -> Self { - self.schema = Some(schema); + pub fn with_reader_schema(mut self, reader_schema: AvroSchema<'static>) -> Self { + self.reader_schema = Some(reader_schema); self } + /// Sets the `SchemaStore` used for resolving writer schemas. + /// + /// This is necessary when decoding single-object encoded data that identifies + /// schemas by a fingerprint. The store allows the decoder to look up the + /// full writer schema from a fingerprint embedded in the data. + /// + /// Defaults to `None`. + pub fn with_writer_schema_store(mut self, store: SchemaStore<'static>) -> Self { + self.writer_schema_store = Some(store); + self + } + + /// Sets the initial schema fingerprint for decoding single-object encoded data. + /// + /// This is useful when the data stream does not begin with a schema definition + /// or fingerprint, allowing the decoder to start with a known schema from the + /// `SchemaStore`. + /// + /// Defaults to `None`. + pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self { + self.active_fingerprint = Some(fp); + self + } + + /// Set the maximum number of decoders to cache. + /// + /// When dealing with Avro files that contain multiple schemas, we may need to switch + /// between different decoders. This cache avoids rebuilding them from scratch every time. + /// + /// Defaults to `20`. 
+ pub fn with_max_decoder_cache_size(mut self, n: usize) -> Self { + self.decoder_cache_size = n; + self + } + + fn validate(&self) -> Result<(), ArrowError> { + match ( + self.writer_schema_store.as_ref(), + self.reader_schema.as_ref(), + self.active_fingerprint.as_ref(), + ) { + (Some(_), None, _) => Err(ArrowError::ParseError( + "Reader schema must be set when writer schema store is provided".into(), + )), + (None, _, Some(_)) => Err(ArrowError::ParseError( + "Active fingerprint requires a writer schema store".into(), + )), + _ => Ok(()), + } + } + /// Create a [`Reader`] from this builder and a `BufRead` pub fn build<R: BufRead>(self, mut reader: R) -> Result<Reader<R>, ArrowError> { - let (header, decoder) = self.build_impl(&mut reader)?; + self.validate()?; + let header = read_header(&mut reader)?; + let decoder = self.make_decoder(Some(&header))?; Ok(Reader { reader, header, Review Comment: Actually I should have double-checked this before blindly accepting, but due to the benefits of keeping the `Header` loosely coupled with the `Decoder` it's probably best to leave this alone for now imo. I can follow up on this one though. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org