nevi-me commented on a change in pull request #7917: URL: https://github.com/apache/arrow/pull/7917#discussion_r471964910
########## File path: rust/parquet/src/arrow/schema.rs ########## @@ -83,12 +90,77 @@ where .map(|fields| Schema::new_with_metadata(fields, metadata)) } +/// Try to convert Arrow schema metadata into a schema +fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Option<Schema> { + let decoded = base64::decode(encoded_meta); + match decoded { + Ok(bytes) => { + let slice = if bytes[0..4] == [255u8; 4] { + &bytes[8..] + } else { + bytes.as_slice() + }; + let message = arrow::ipc::get_root_as_message(slice); + message + .header_as_schema() + .map(arrow::ipc::convert::fb_to_schema) + } + Err(err) => { + // The C++ implementation returns an error if the schema can't be parsed. + // To prevent this, we explicitly log this, then compute the schema without the metadata + eprintln!( + "Unable to decode the encoded schema stored in {}, {:?}", + super::ARROW_SCHEMA_META_KEY, + err + ); + None + } + } +} + +/// Mutates writer metadata by encoding the Arrow schema and storing it in the metadata. +/// If there is an existing Arrow schema metadata, it is replaced. +pub fn add_encoded_arrow_schema_to_metadata( + schema: &Schema, + props: &mut WriterProperties, +) { + let mut serialized_schema = arrow::ipc::writer::schema_to_bytes(&schema); + let schema_len = serialized_schema.len(); + let mut len_prefix_schema = Vec::with_capacity(schema_len + 8); + len_prefix_schema.append(&mut vec![255u8, 255, 255, 255]); Review comment: The metadata length has been included since 0.15, but Rust fell behind on the implementation. Without the payload, pyarrow fails to read the file. I opened a JIRA to address the IPC issues, so I'll change this when I get to the JIRA (hoping before the next release) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org