This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 90d4f1d  ARROW-12493: Add support for writing dictionary arrays to CSV and JSON (#16)
90d4f1d is described below

commit 90d4f1d33e6644554c4db4592b8fa45f7c209356
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Apr 22 13:49:14 2021 +0100

    ARROW-12493: Add support for writing dictionary arrays to CSV and JSON (#16)
---
 arrow/src/csv/writer.rs  | 40 ++++++++++++++++++++++++--------
 arrow/src/json/writer.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 88 insertions(+), 12 deletions(-)

diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs
index e9d8565..f2f4ce8 100644
--- a/arrow/src/csv/writer.rs
+++ b/arrow/src/csv/writer.rs
@@ -128,13 +128,13 @@ impl<W: Write> Writer<W> {
     /// Convert a record to a string vector
     fn convert(
         &self,
-        batch: &RecordBatch,
+        batch: &[ArrayRef],
         row_index: usize,
         buffer: &mut [String],
     ) -> Result<()> {
         // TODO: it'd be more efficient if we could create `record: Vec<&[u8]>
         for (col_index, item) in buffer.iter_mut().enumerate() {
-            let col = batch.column(col_index);
+            let col = &batch[col_index];
             if col.is_null(row_index) {
                 // write an empty value
                 *item = "".to_string();
@@ -274,10 +274,22 @@ impl<W: Write> Writer<W> {
             self.beginning = false;
         }
 
+        let columns: Vec<_> = batch
+            .columns()
+            .iter()
+            .map(|array| match array.data_type() {
+                DataType::Dictionary(_, value_type) => {
+                    crate::compute::kernels::cast::cast(array, &value_type)
+                        .expect("cannot cast dictionary to underlying values")
+                }
+                _ => array.clone(),
+            })
+            .collect();
+
         let mut buffer = vec!["".to_string(); batch.num_columns()];
 
         for row_index in 0..batch.num_rows() {
-            self.convert(batch, row_index, &mut buffer)?;
+            self.convert(columns.as_slice(), row_index, &mut buffer)?;
             self.writer.write_record(&buffer)?;
         }
         self.writer.flush()?;
@@ -420,6 +432,11 @@ mod tests {
             Field::new("c4", DataType::Boolean, true),
             Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true),
             Field::new("c6", DataType::Time32(TimeUnit::Second), false),
+            Field::new(
+                "c7",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
+            ),
         ]);
 
         let c1 = StringArray::from(vec![
@@ -439,6 +456,8 @@ mod tests {
             None,
         );
         let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]);
+        let c7: DictionaryArray<Int32Type> =
+            vec!["cupcakes", "cupcakes", "foo"].into_iter().collect();
 
         let batch = RecordBatch::try_new(
             Arc::new(schema),
@@ -449,6 +468,7 @@ mod tests {
                 Arc::new(c4),
                 Arc::new(c5),
                 Arc::new(c6),
+                Arc::new(c7),
             ],
         )
         .unwrap();
@@ -466,13 +486,13 @@ mod tests {
         file.read_to_end(&mut buffer).unwrap();
 
         assert_eq!(
-            r#"c1,c2,c3,c4,c5,c6
-Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
-consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
-sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
-Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
-consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
-sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
+            r#"c1,c2,c3,c4,c5,c6,c7
+Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
+Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
 "#
             .to_string(),
             String::from_utf8(buffer).unwrap()
diff --git a/arrow/src/json/writer.rs b/arrow/src/json/writer.rs
index 27c1ff1..8587c1d 100644
--- a/arrow/src/json/writer.rs
+++ b/arrow/src/json/writer.rs
@@ -480,6 +480,12 @@ fn set_column_for_json_rows(
                     }
                 });
         }
+        DataType::Dictionary(_, value_type) => {
+            let slice = array.slice(0, row_count);
+            let hydrated = crate::compute::kernels::cast::cast(&slice, &value_type)
+                .expect("cannot cast dictionary to underlying values");
+            set_column_for_json_rows(rows, row_count, &hydrated, col_name)
+        }
         _ => {
             panic!("Unsupported datatype: {:#?}", array.data_type());
         }
@@ -681,8 +687,8 @@ mod tests {
     #[test]
     fn write_simple_rows() {
         let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Utf8, false),
+            Field::new("c1", DataType::Int32, true),
+            Field::new("c2", DataType::Utf8, true),
         ]);
 
         let a = Int32Array::from(vec![Some(1), Some(2), Some(3), None, Some(5)]);
@@ -710,6 +716,56 @@ mod tests {
     }
 
     #[test]
+    fn write_dictionary() {
+        let schema = Schema::new(vec![
+            Field::new(
+                "c1",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+            Field::new(
+                "c2",
+                DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
+                true,
+            ),
+        ]);
+
+        let a: DictionaryArray<Int32Type> = vec![
+            Some("cupcakes"),
+            Some("foo"),
+            Some("foo"),
+            None,
+            Some("cupcakes"),
+        ]
+        .into_iter()
+        .collect();
+        let b: DictionaryArray<Int8Type> =
+            vec![Some("sdsd"), Some("sdsd"), None, Some("sd"), Some("sdsd")]
+                .into_iter()
+                .collect();
+
+        let batch =
+            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])
+                .unwrap();
+
+        let mut buf = Vec::new();
+        {
+            let mut writer = LineDelimitedWriter::new(&mut buf);
+            writer.write_batches(&[batch]).unwrap();
+        }
+
+        assert_eq!(
+            String::from_utf8(buf).unwrap(),
+            r#"{"c1":"cupcakes","c2":"sdsd"}
+{"c1":"foo","c2":"sdsd"}
+{"c1":"foo"}
+{"c2":"sd"}
+{"c1":"cupcakes","c2":"sdsd"}
+"#
+        );
+    }
+
+    #[test]
     fn write_timestamps() {
         let ts_string = "2018-11-13T17:11:10.011375885995";
         let ts_nanos = ts_string

Reply via email to