jecsand838 commented on code in PR #8006:
URL: https://github.com/apache/arrow-rs/pull/8006#discussion_r2249126660


##########
arrow-avro/src/reader/mod.rs:
##########
@@ -272,17 +484,92 @@ impl ReaderBuilder {
         self
     }
 
-    /// Sets the Avro schema.
+    /// Sets the Avro reader schema.
     ///
     /// If a schema is not provided, the schema will be read from the Avro 
file header.
-    pub fn with_schema(mut self, schema: AvroSchema<'static>) -> Self {
-        self.schema = Some(schema);
+    pub fn with_reader_schema(mut self, reader_schema: AvroSchema<'static>) -> 
Self {
+        self.reader_schema = Some(reader_schema);
         self
     }
 
+    /// Sets the `SchemaStore` used for resolving writer schemas.
+    ///
+    /// This is necessary when decoding single-object encoded data that 
identifies
+    /// schemas by a fingerprint. The store allows the decoder to look up the
+    /// full writer schema from a fingerprint embedded in the data.
+    ///
+    /// Defaults to `None`.
+    pub fn with_writer_schema_store(mut self, store: SchemaStore<'static>) -> 
Self {
+        self.writer_schema_store = Some(store);
+        self
+    }
+
+    /// Sets the initial schema fingerprint for decoding single-object encoded 
data.
+    ///
+    /// This is useful when the data stream does not begin with a schema 
definition
+    /// or fingerprint, allowing the decoder to start with a known schema from 
the
+    /// `SchemaStore`.
+    ///
+    /// Defaults to `None`.
+    pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self {
+        self.active_fingerprint = Some(fp);
+        self
+    }
+
+    /// Set the maximum number of decoders to cache.
+    ///
+    /// When dealing with Avro files that contain multiple schemas, we may 
need to switch
+    /// between different decoders. This cache avoids rebuilding them from 
scratch every time.
+    ///
+    /// Defaults to `20`.
+    pub fn with_max_decoder_cache_size(mut self, n: usize) -> Self {
+        self.decoder_cache_size = n;
+        self
+    }
+
+    // Validate the builder configuration against this truth‑table
+    //
+    // | writer_schema_store | reader_schema | active_fingerprint | Result |
+    // |---------------------|---------------|--------------------|--------|
+    // | None----------------| None----------| None---------------| Err----|
+    // | None----------------| None----------| Some---------------| Err----|
+    // | None----------------| Some----------| None---------------| Ok-----|
+    // | None----------------| Some----------| Some---------------| Err----|
+    // | Some----------------| None----------| None---------------| Err----|
+    // | Some----------------| None----------| Some---------------| Err----|
+    // | Some----------------| Some----------| None---------------| Ok-----|
+    // | Some----------------| Some----------| Some---------------| Ok-----|
+    fn validate(&self) -> Result<(), ArrowError> {
+        match (
+            self.writer_schema_store.is_some(),
+            self.reader_schema.is_some(),
+            self.active_fingerprint.is_some(),
+        ) {
+            // Row 3: No store, reader schema present, no fingerprint
+            (false, true, false)
+            // Row 7: Store is present, reader schema is resent, no fingerprint
+            | (true, true, false)
+            // Row 8: Store present, reader schema present, fingerprint present
+            | (true, true, true) => Ok(()),
+            // Fingerprint without a store (rows 2 & 4)
+            (false, _, true) => Err(ArrowError::InvalidArgumentError(
+                "Active fingerprint requires a writer schema store".into(),
+            )),
+            // Store present but no reader schema (rows 5 & 6)
+            (true, false, _) => Err(ArrowError::InvalidArgumentError(
+                "Reader schema must be set when writer schema store is 
provided".into(),
+            )),
+            // No schema store or reader schema provided (row 1)
+            (false, false, _) => Err(ArrowError::InvalidArgumentError(

Review Comment:
   @scovich So you were correct here. I was able to complete remove the 
`validate` method and ended up simplifying a lot of the logic. For instance I 
managed to slim down `make_decoder` to this:
   
   ```rust
       fn make_decoder(&self, header: Option<&Header>) -> Result<Decoder, 
ArrowError> {
           if let Some(hdr) = header {
               let writer_schema = hdr
                   .schema()
                   .map_err(|e| ArrowError::ExternalError(Box::new(e)))?
                   .ok_or_else(|| {
                       ArrowError::ParseError("No Avro schema present in file 
header".into())
                   })?;
               let record_decoder =
                   self.make_record_decoder(&writer_schema, 
self.reader_schema.as_ref())?;
               return Ok(self.make_decoder_with_parts(record_decoder, None, 
None, None));
           }
           let writer_schema_store = 
self.writer_schema_store.as_ref().ok_or_else(|| {
               ArrowError::ParseError("Writer schema store required for raw 
Avro".into())
           })?;
           let fingerprint = self
               .active_fingerprint
               .or_else(|| 
writer_schema_store.fingerprints().into_iter().next())
               .ok_or_else(|| {
                   ArrowError::ParseError(
                       "Writer schema store must contain at least one 
schema".into(),
                   )
               })?;
           let writer_schema = 
writer_schema_store.lookup(&fingerprint).ok_or_else(|| {
               ArrowError::ParseError("Active fingerprint not found in schema 
store".into())
           })?;
           let record_decoder =
               self.make_record_decoder(writer_schema, 
self.reader_schema.as_ref())?;
           Ok(self.make_decoder_with_parts(
               record_decoder,
               Some(fingerprint),
               self.reader_schema.clone(),
               self.writer_schema_store.clone(),
           ))
       }
   ```
   
   We ended up at a much better place. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to