jecsand838 commented on code in PR #8759:
URL: https://github.com/apache/arrow-rs/pull/8759#discussion_r2719817620
##########
arrow-avro/src/reader/mod.rs:
##########
@@ -749,7 +749,7 @@ impl Decoder {
buf: &[u8],
magic: &[u8; MAGIC_LEN],
fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
- ) -> Result<Option<usize>, ArrowError> {
+ ) -> Result<Option<usize>> {
Review Comment:
This maybe just me, but not specifying the error enum being passed back
makes the code a bit trickier for me to grok. Especially now that we have
multiple error enums.
##########
arrow-avro/src/errors.rs:
##########
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Common Avro errors and macros.
+
+use arrow_schema::ArrowError;
+use core::num::TryFromIntError;
+use std::error::Error;
+use std::string::FromUtf8Error;
+use std::{io, result, str};
+
+/// Avro error enumeration
+
+#[derive(Debug)]
+#[non_exhaustive]
+pub enum AvroError {
+ /// General Avro error.
+ /// Returned when code violates normal workflow of working with Avro data.
+ General(String),
+ /// "Not yet implemented" Avro error.
+ /// Returned when functionality is not yet available.
+ NYI(String),
+ /// "End of file" Avro error.
+ /// Returned when IO related failures occur, e.g. when there are not
enough bytes to
+ /// decode.
+ EOF(String),
+ /// Arrow error.
+ /// Returned when reading into arrow or writing from arrow.
+ ArrowError(Box<ArrowError>),
+ /// Error when the requested index is more than the
+ /// number of items expected
+ IndexOutOfBound(usize, usize),
+ /// Error indicating that an unexpected or bad argument was passed to a
function.
+ InvalidArgument(String),
+ /// Error indicating that a value could not be parsed.
+ ParseError(String),
+ /// Error indicating that a schema is invalid.
+ SchemaError(String),
+ /// An external error variant
+ External(Box<dyn Error + Send + Sync>),
+ /// Error during IO operations
+ IoError(String, io::Error),
+ /// Returned when a function needs more data to complete properly. The
`usize` field indicates
+ /// the total number of bytes required, not the number of additional bytes.
+ NeedMoreData(usize),
+ /// Returned when a function needs more data to complete properly.
+ /// The `Range<u64>` indicates the range of bytes that are needed.
+ NeedMoreDataRange(std::ops::Range<u64>),
+}
+
+impl std::fmt::Display for AvroError {
+ fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
+ match &self {
+ AvroError::General(message) => {
+ write!(fmt, "Avro error: {message}")
+ }
+ AvroError::NYI(message) => write!(fmt, "NYI: {message}"),
+ AvroError::EOF(message) => write!(fmt, "EOF: {message}"),
+ AvroError::ArrowError(message) => write!(fmt, "Arrow: {message}"),
+ AvroError::IndexOutOfBound(index, bound) => {
+ write!(fmt, "Index {index} out of bound: {bound}")
+ }
+ AvroError::InvalidArgument(message) => {
+ write!(fmt, "Invalid argument: {message}")
+ }
+ AvroError::ParseError(message) => write!(fmt, "Parse error:
{message}"),
+ AvroError::SchemaError(message) => write!(fmt, "Schema error:
{message}"),
+ AvroError::External(e) => write!(fmt, "External: {e}"),
+ AvroError::IoError(message, e) => write!(fmt, "I/O Error:
{message}: {e}"),
+ AvroError::NeedMoreData(needed) => write!(fmt, "NeedMoreData:
{needed}"),
+ AvroError::NeedMoreDataRange(range) => {
+ write!(fmt, "NeedMoreDataRange: {}..{}", range.start,
range.end)
+ }
+ }
+ }
+}
+
+impl Error for AvroError {
+ fn source(&self) -> Option<&(dyn Error + 'static)> {
+ match self {
+ AvroError::External(e) => Some(e.as_ref()),
+ AvroError::ArrowError(e) => Some(e.as_ref()),
+ AvroError::IoError(_, e) => Some(e),
+ _ => None,
+ }
+ }
+}
+
+impl From<TryFromIntError> for AvroError {
+ fn from(e: TryFromIntError) -> AvroError {
+ AvroError::General(format!("Integer overflow: {e}"))
+ }
+}
+
+impl From<io::Error> for AvroError {
+ fn from(e: io::Error) -> AvroError {
+ AvroError::External(Box::new(e))
+ }
+}
+
+impl From<str::Utf8Error> for AvroError {
+ fn from(e: str::Utf8Error) -> AvroError {
+ AvroError::External(Box::new(e))
+ }
+}
+
+impl From<FromUtf8Error> for AvroError {
+ fn from(e: FromUtf8Error) -> AvroError {
+ AvroError::External(Box::new(e))
+ }
+}
+
+impl From<ArrowError> for AvroError {
+ fn from(e: ArrowError) -> Self {
+ AvroError::ArrowError(Box::new(e))
+ }
+}
+
+/// A specialized `Result` for Avro errors.
+pub type Result<T, E = AvroError> = result::Result<T, E>;
Review Comment:
I'm thinking we should remove this imo. From a reviewer perspective, the
custom `Result` caused some confusion in correctly understanding the diff.
```suggestion
```
##########
arrow-avro/src/lib.rs:
##########
@@ -195,6 +195,9 @@ pub mod compression;
/// Avro data types and Arrow data types.
pub mod codec;
+/// AvroError variants
+pub mod errors;
Review Comment:
Same here, if internal only, then should we make this change?
```suggestion
pub(crate) mod errors;
```
##########
arrow-avro/src/reader/cursor.rs:
##########
@@ -43,88 +43,81 @@ impl<'a> AvroCursor<'a> {
/// Read a single `u8`
#[inline]
- pub(crate) fn get_u8(&mut self) -> Result<u8, ArrowError> {
+ pub(crate) fn get_u8(&mut self) -> Result<u8> {
match self.buf.first().copied() {
Some(x) => {
self.buf = &self.buf[1..];
Ok(x)
}
- None => Err(ArrowError::ParseError("Unexpected EOF".to_string())),
+ None => Err(AvroError::EOF("Unexpected EOF".to_string())),
}
}
#[inline]
- pub(crate) fn get_bool(&mut self) -> Result<bool, ArrowError> {
+ pub(crate) fn get_bool(&mut self) -> Result<bool> {
Ok(self.get_u8()? != 0)
}
- pub(crate) fn read_vlq(&mut self) -> Result<u64, ArrowError> {
- let (val, offset) = read_varint(self.buf)
- .ok_or_else(|| ArrowError::ParseError("bad varint".to_string()))?;
+ pub(crate) fn read_vlq(&mut self) -> Result<u64> {
+ let (val, offset) =
+ read_varint(self.buf).ok_or_else(|| AvroError::ParseError("bad
varint".to_string()))?;
self.buf = &self.buf[offset..];
Ok(val)
}
#[inline]
- pub(crate) fn get_int(&mut self) -> Result<i32, ArrowError> {
+ pub(crate) fn get_int(&mut self) -> Result<i32> {
let varint = self.read_vlq()?;
let val: u32 = varint
.try_into()
- .map_err(|_| ArrowError::ParseError("varint
overflow".to_string()))?;
+ .map_err(|_| AvroError::ParseError("varint
overflow".to_string()))?;
Ok((val >> 1) as i32 ^ -((val & 1) as i32))
}
#[inline]
- pub(crate) fn get_long(&mut self) -> Result<i64, ArrowError> {
+ pub(crate) fn get_long(&mut self) -> Result<i64> {
let val = self.read_vlq()?;
Ok((val >> 1) as i64 ^ -((val & 1) as i64))
}
- pub(crate) fn get_bytes(&mut self) -> Result<&'a [u8], ArrowError> {
- let len: usize = self.get_long()?.try_into().map_err(|_| {
- ArrowError::ParseError("offset overflow reading avro
bytes".to_string())
- })?;
+ pub(crate) fn get_bytes(&mut self) -> Result<&'a [u8]> {
+ let len: usize = self
+ .get_long()?
+ .try_into()
+ .map_err(|_| AvroError::ParseError("offset overflow reading avro
bytes".to_string()))?;
if self.buf.len() < len {
- return Err(ArrowError::ParseError(
- "Unexpected EOF reading bytes".to_string(),
- ));
+ return Err(AvroError::EOF("Unexpected EOF reading
bytes".to_string()));
}
let ret = &self.buf[..len];
self.buf = &self.buf[len..];
Ok(ret)
}
#[inline]
- pub(crate) fn get_float(&mut self) -> Result<f32, ArrowError> {
+ pub(crate) fn get_float(&mut self) -> Result<f32> {
if self.buf.len() < 4 {
- return Err(ArrowError::ParseError(
- "Unexpected EOF reading float".to_string(),
- ));
+ return Err(AvroError::EOF("Unexpected EOF reading
float".to_string()));
}
let ret = f32::from_le_bytes(self.buf[..4].try_into().unwrap());
self.buf = &self.buf[4..];
Ok(ret)
}
#[inline]
- pub(crate) fn get_double(&mut self) -> Result<f64, ArrowError> {
+ pub(crate) fn get_double(&mut self) -> Result<f64> {
if self.buf.len() < 8 {
- return Err(ArrowError::ParseError(
- "Unexpected EOF reading float".to_string(),
- ));
+ return Err(AvroError::EOF("Unexpected EOF reading
float".to_string()));
Review Comment:
I know this was pre-existing, but maybe worth taking the chance to correct?
```suggestion
return Err(AvroError::EOF("Unexpected EOF reading
double".to_string()));
```
##########
arrow-avro/src/reader/mod.rs:
##########
@@ -1023,13 +1023,13 @@ impl ReaderBuilder {
&self,
header: Option<&Header>,
reader_schema: Option<&AvroSchema>,
- ) -> Result<Decoder, ArrowError> {
+ ) -> Result<Decoder> {
if let Some(hdr) = header {
let writer_schema = hdr
.schema()
- .map_err(|e| ArrowError::ExternalError(Box::new(e)))?
+ .map_err(|e| AvroError::External(Box::new(e)))?
Review Comment:
Because `schema` returns `Result<Option<Schema<'_>>> // Result<_,
AvroError>` (the custom `Result` made this less readable for me), the closure
`|e| AvroError::External(Box::new(e))` is still wrapping an `AvroError` inside
`AvroError::External`.
I think @martin-g called this out in another comment as well.
You can probably improve this just by something like this:
```rust
let writer_schema = hdr
.schema()?
.ok_or_else(|| AvroError::ParseError("No Avro schema present
in file header".into()))?;
```
##########
arrow-avro/src/reader/header.rs:
##########
@@ -310,7 +310,7 @@ mod test {
let mut decoder = HeaderDecoder::default();
decoder.decode(b"Ob").unwrap();
let err = decoder.decode(b"s").unwrap_err().to_string();
- assert_eq!(err, "Parser error: Incorrect avro magic");
+ assert_eq!(err, "Parse error: Incorrect avro magic");
Review Comment:
Is there a reason we needed to change the test expectation here?
##########
arrow-avro/src/reader/mod.rs:
##########
@@ -835,7 +835,7 @@ impl Decoder {
// We must flush the active decoder before switching to the pending
one.
let batch = self.flush_and_reset();
self.apply_pending_schema();
- batch
+ batch.map_err(ArrowError::from)
Review Comment:
I'm actually curious, why are we not following the pattern used by `parquet`
by returning the `pub AvroError`? I thought that was the big reason for doing
this PR?
I re-read the PR description and just caught this part, not sure I fully
agree. I'm thinking we should either return `AvroError` or
`ArrowError::AvroError` as part of the public API to align with the other
crates and help downstream callers.
##########
arrow-avro/src/errors.rs:
##########
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Common Avro errors and macros.
+
+use arrow_schema::ArrowError;
+use core::num::TryFromIntError;
+use std::error::Error;
+use std::string::FromUtf8Error;
+use std::{io, result, str};
+
+/// Avro error enumeration
+
+#[derive(Debug)]
+#[non_exhaustive]
+pub enum AvroError {
Review Comment:
If these errors are **internal** only, then why make this `pub`?
My preference would be to return `pub` if possible (like in the `parquet`
crate), but if we can't shouldn't we make this `pub(crate)`?
##########
arrow-avro/src/writer/encoder.rs:
##########
@@ -178,15 +172,15 @@ fn write_sign_extended<W: Write + ?Sized>(
let mut rem = pad_len;
while rem >= pad.len() {
out.write_all(pad)
- .map_err(|e| ArrowError::IoError(format!("write decimal fixed:
{e}"), e))?;
+ .map_err(|e| AvroError::General(format!("write decimal fixed:
{e}")))?;
rem -= pad.len();
}
if rem > 0 {
out.write_all(&pad[..rem])
- .map_err(|e| ArrowError::IoError(format!("write decimal fixed:
{e}"), e))?;
+ .map_err(|e| AvroError::General(format!("write decimal fixed:
{e}")))?;
}
out.write_all(src_be)
- .map_err(|e| ArrowError::IoError(format!("write decimal fixed: {e}"),
e))
+ .map_err(|e| AvroError::General(format!("write decimal fixed: {e}")))
Review Comment:
We'll want to preserve the error context in these `map_err `'s as well imo.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]