corasaurus-hex commented on code in PR #18457:
URL: https://github.com/apache/datafusion/pull/18457#discussion_r2501481054
##########
datafusion/datasource-arrow/src/file_format.rs:
##########
@@ -344,40 +382,68 @@ impl DataSink for ArrowFileSink {
}
}
+// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs.
+// See <https://github.com/apache/arrow-rs/issues/5021>
+
const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1'];
const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
-/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs.
-/// See <https://github.com/apache/arrow-rs/issues/5021>
-async fn infer_schema_from_file_stream(
+async fn infer_ipc_schema(
mut stream: BoxStream<'static, object_store::Result<Bytes>>,
) -> Result<SchemaRef> {
- // Expected format:
- // <magic number "ARROW1"> - 6 bytes
- // <empty padding bytes [to 8 byte boundary]> - 2 bytes
- // <continuation: 0xFFFFFFFF> - 4 bytes, not present below v0.15.0
- // <metadata_size: int32> - 4 bytes
- // <metadata_flatbuffer: bytes>
- // <rest of file bytes>
-
- // So in first read we need at least all known sized sections,
- // which is 6 + 2 + 4 + 4 = 16 bytes.
+ // Expected IPC format is either:
+ //
+ // stream:
+ // <continuation: 0xFFFFFFFF> - 4 bytes (added in v0.15.0+)
+ // <metadata_size: int32> - 4 bytes
+ // <metadata_flatbuffer: bytes>
+ // <rest of file bytes>
+ //
+ // file:
+ // <magic number "ARROW1"> - 6 bytes
+ // <empty padding bytes [to 8 byte boundary]> - 2 bytes
+ // <stream format above>
+
+ // Perform the initial read such that we always have the metadata size
let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?;
- // Files should start with these magic bytes
- if bytes[0..6] != ARROW_MAGIC {
- return Err(ArrowError::ParseError(
- "Arrow file does not contain correct header".to_string(),
- ))?;
- }
-
- // Since continuation marker bytes added in later versions
- let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER {
- (&bytes[12..16], 16)
+ // The preamble size is everything before the metadata size
+ let preamble_size = if bytes[0..6] == ARROW_MAGIC {
+ // File format starts with magic number "ARROW1"
+ if bytes[8..12] == CONTINUATION_MARKER {
+ // Continuation marker was added in v0.15.0
+ 12
+ } else {
+ // File format before v0.15.0
+ 8
+ }
+ } else if bytes[0..4] == CONTINUATION_MARKER {
+ // Stream format after v0.15.0 starts with continuation marker
+ 4
} else {
- (&bytes[8..12], 12)
+ // Stream format before v0.15.0 does not have a preamble
+ 0
};
+ infer_ipc_schema_ignoring_preamble_bytes(bytes, preamble_size, stream).await
+}
+
+async fn infer_ipc_schema_ignoring_preamble_bytes(
+ bytes: Vec<u8>,
+ preamble_size: usize,
+ mut stream: BoxStream<'static, object_store::Result<Bytes>>,
+) -> Result<SchemaRef> {
+ let (meta_len, rest_of_bytes_start_index): ([u8; 4], usize) = (
+ bytes[preamble_size..preamble_size + 4]
+ .try_into()
+ .map_err(|err| {
+ ArrowError::ParseError(format!(
+ "Unable to read IPC message as metadata length: {err:?}"
+ ))
+ })?,
+ preamble_size + 4,
+ );
Review Comment:
fixed in the version I'm pushing up
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]