jecsand838 commented on code in PR #8006: URL: https://github.com/apache/arrow-rs/pull/8006#discussion_r2249126660
########## arrow-avro/src/reader/mod.rs: ########## @@ -272,17 +484,92 @@ impl ReaderBuilder { self } - /// Sets the Avro schema. + /// Sets the Avro reader schema. /// /// If a schema is not provided, the schema will be read from the Avro file header. - pub fn with_schema(mut self, schema: AvroSchema<'static>) -> Self { - self.schema = Some(schema); + pub fn with_reader_schema(mut self, reader_schema: AvroSchema<'static>) -> Self { + self.reader_schema = Some(reader_schema); self } + /// Sets the `SchemaStore` used for resolving writer schemas. + /// + /// This is necessary when decoding single-object encoded data that identifies + /// schemas by a fingerprint. The store allows the decoder to look up the + /// full writer schema from a fingerprint embedded in the data. + /// + /// Defaults to `None`. + pub fn with_writer_schema_store(mut self, store: SchemaStore<'static>) -> Self { + self.writer_schema_store = Some(store); + self + } + + /// Sets the initial schema fingerprint for decoding single-object encoded data. + /// + /// This is useful when the data stream does not begin with a schema definition + /// or fingerprint, allowing the decoder to start with a known schema from the + /// `SchemaStore`. + /// + /// Defaults to `None`. + pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self { + self.active_fingerprint = Some(fp); + self + } + + /// Set the maximum number of decoders to cache. + /// + /// When dealing with Avro files that contain multiple schemas, we may need to switch + /// between different decoders. This cache avoids rebuilding them from scratch every time. + /// + /// Defaults to `20`. + pub fn with_max_decoder_cache_size(mut self, n: usize) -> Self { + self.decoder_cache_size = n; + self + } + + // Validate the builder configuration against this truth‑table + // + // | writer_schema_store | reader_schema | active_fingerprint | Result | + // |---------------------|---------------|--------------------|--------| + // | None----------------| None----------| None---------------| Err----| + // | None----------------| None----------| Some---------------| Err----| + // | None----------------| Some----------| None---------------| Ok-----| + // | None----------------| Some----------| Some---------------| Err----| + // | Some----------------| None----------| None---------------| Err----| + // | Some----------------| None----------| Some---------------| Err----| + // | Some----------------| Some----------| None---------------| Ok-----| + // | Some----------------| Some----------| Some---------------| Ok-----| + fn validate(&self) -> Result<(), ArrowError> { + match ( + self.writer_schema_store.is_some(), + self.reader_schema.is_some(), + self.active_fingerprint.is_some(), + ) { + // Row 3: No store, reader schema present, no fingerprint + (false, true, false) + // Row 7: Store is present, reader schema is resent, no fingerprint + | (true, true, false) + // Row 8: Store present, reader schema present, fingerprint present + | (true, true, true) => Ok(()), + // Fingerprint without a store (rows 2 & 4) + (false, _, true) => Err(ArrowError::InvalidArgumentError( + "Active fingerprint requires a writer schema store".into(), + )), + // Store present but no reader schema (rows 5 & 6) + (true, false, _) => Err(ArrowError::InvalidArgumentError( + "Reader schema must be set when writer schema store is provided".into(), + )), + // No schema store or reader schema provided (row 1) + (false, false, _) => Err(ArrowError::InvalidArgumentError( Review Comment: @scovich So you were correct here. I was able to complete remove the `validate` method and ended up simplifying a lot of the logic. For instance I managed to slim down `make_decoder` to this: ```rust fn make_decoder(&self, header: Option<&Header>) -> Result<Decoder, ArrowError> { if let Some(hdr) = header { let writer_schema = hdr .schema() .map_err(|e| ArrowError::ExternalError(Box::new(e)))? .ok_or_else(|| { ArrowError::ParseError("No Avro schema present in file header".into()) })?; let record_decoder = self.make_record_decoder(&writer_schema, self.reader_schema.as_ref())?; return Ok(self.make_decoder_with_parts(record_decoder, None, None, None)); } let writer_schema_store = self.writer_schema_store.as_ref().ok_or_else(|| { ArrowError::ParseError("Writer schema store required for raw Avro".into()) })?; let fingerprint = self .active_fingerprint .or_else(|| writer_schema_store.fingerprints().into_iter().next()) .ok_or_else(|| { ArrowError::ParseError( "Writer schema store must contain at least one schema".into(), ) })?; let writer_schema = writer_schema_store.lookup(&fingerprint).ok_or_else(|| { ArrowError::ParseError("Active fingerprint not found in schema store".into()) })?; let record_decoder = self.make_record_decoder(writer_schema, self.reader_schema.as_ref())?; Ok(self.make_decoder_with_parts( record_decoder, Some(fingerprint), self.reader_schema.clone(), self.writer_schema_store.clone(), )) } ``` We ended up at a much better place. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org