This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c498eb74853 feat: support encoding of binary in CSV writer (#5782)
c498eb74853 is described below
commit c498eb74853f5b307b89f1123262fc5f5e1c890e
Author: Trevor Hilton <[email protected]>
AuthorDate: Mon May 20 06:10:27 2024 -0400
feat: support encoding of binary in CSV writer (#5782)
Allows writing binary data (Binary, LargeBinary, and FixedSizeBinary) to
CSV. Note: FixedSizeBinary was already supported in this way.
Values are encoded as hexadecimal strings using the default Arrow formatter.
A test was added that accounts for null values when encoding all three
binary types in CSV.
---
arrow-csv/src/writer.rs | 74 ++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 64 insertions(+), 10 deletions(-)
diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs
index a31a1d5e8c1..5edb93bcb64 100644
--- a/arrow-csv/src/writer.rs
+++ b/arrow-csv/src/writer.rs
@@ -131,15 +131,15 @@ impl<W: Write> Writer<W> {
let converters = batch
.columns()
.iter()
- .map(|a| match a.data_type() {
- d if d.is_nested() => Err(ArrowError::CsvError(format!(
- "Nested type {} is not supported in CSV",
- a.data_type()
- ))),
- DataType::Binary | DataType::LargeBinary => Err(ArrowError::CsvError(
- "Binary data cannot be written to CSV".to_string(),
- )),
- _ => ArrayFormatter::try_new(a.as_ref(), &options),
+ .map(|a| {
+ if a.data_type().is_nested() {
+ Err(ArrowError::CsvError(format!(
+ "Nested type {} is not supported in CSV",
+ a.data_type()
+ )))
+ } else {
+ ArrayFormatter::try_new(a.as_ref(), &options)
+ }
})
.collect::<Result<Vec<_>, ArrowError>>()?;
@@ -425,7 +425,10 @@ mod tests {
use super::*;
use crate::ReaderBuilder;
- use arrow_array::builder::{Decimal128Builder, Decimal256Builder};
+ use arrow_array::builder::{
+ BinaryBuilder, Decimal128Builder, Decimal256Builder, FixedSizeBinaryBuilder,
+ LargeBinaryBuilder,
+ };
use arrow_array::types::*;
use arrow_buffer::i256;
use std::io::{Cursor, Read, Seek};
@@ -759,4 +762,55 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
String::from_utf8(buffer).unwrap()
);
}
+
+ #[test]
+ fn test_write_csv_binary() {
+ let fixed_size = 8;
+ let schema = SchemaRef::new(Schema::new(vec![
+ Field::new("c1", DataType::Binary, true),
+ Field::new("c2", DataType::FixedSizeBinary(fixed_size), true),
+ Field::new("c3", DataType::LargeBinary, true),
+ ]));
+ let mut c1_builder = BinaryBuilder::new();
+ c1_builder.append_value(b"Homer");
+ c1_builder.append_value(b"Bart");
+ c1_builder.append_null();
+ c1_builder.append_value(b"Ned");
+ let mut c2_builder = FixedSizeBinaryBuilder::new(fixed_size);
+ c2_builder.append_value(b"Simpson ").unwrap();
+ c2_builder.append_value(b"Simpson ").unwrap();
+ c2_builder.append_null();
+ c2_builder.append_value(b"Flanders").unwrap();
+ let mut c3_builder = LargeBinaryBuilder::new();
+ c3_builder.append_null();
+ c3_builder.append_null();
+ c3_builder.append_value(b"Comic Book Guy");
+ c3_builder.append_null();
+
+ let batch = RecordBatch::try_new(
+ schema,
+ vec![
+ Arc::new(c1_builder.finish()) as ArrayRef,
+ Arc::new(c2_builder.finish()) as ArrayRef,
+ Arc::new(c3_builder.finish()) as ArrayRef,
+ ],
+ )
+ .unwrap();
+
+ let mut buf = Vec::new();
+ let builder = WriterBuilder::new();
+ let mut writer = builder.build(&mut buf);
+ writer.write(&batch).unwrap();
+ drop(writer);
+ assert_eq!(
+ "\
+ c1,c2,c3\n\
+ 486f6d6572,53696d70736f6e20,\n\
+ 42617274,53696d70736f6e20,\n\
+ ,,436f6d696320426f6f6b20477579\n\
+ 4e6564,466c616e64657273,\n\
+ ",
+ String::from_utf8(buf).unwrap()
+ );
+ }
}