alamb commented on code in PR #5557:
URL: https://github.com/apache/arrow-rs/pull/5557#discussion_r1546705146


##########
parquet/src/arrow/buffer/offset_buffer.rs:
##########
@@ -125,18 +128,50 @@ impl<I: OffsetSizeTrait> OffsetBuffer<I> {
 
     /// Converts this into an [`ArrayRef`] with the provided `data_type` and 
`null_buffer`
     pub fn into_array(self, null_buffer: Option<Buffer>, data_type: ArrowType) 
-> ArrayRef {
-        let array_data_builder = ArrayDataBuilder::new(data_type)
-            .len(self.len())
-            .add_buffer(Buffer::from_vec(self.offsets))
-            .add_buffer(Buffer::from_vec(self.values))
-            .null_bit_buffer(null_buffer);
-
-        let data = match cfg!(debug_assertions) {
-            true => array_data_builder.build().unwrap(),
-            false => unsafe { array_data_builder.build_unchecked() },
-        };
-
-        make_array(data)
+        match data_type {
+            ArrowType::Utf8View => {
+                let mut builder = self.build_generic_byte_view();
+                Arc::new(builder.finish().to_stringview().unwrap())
+            }
+            ArrowType::BinaryView => {
+                let mut builder = self.build_generic_byte_view();
+                Arc::new(builder.finish())
+            }
+            _ => {
+                let array_data_builder = ArrayDataBuilder::new(data_type)
+                    .len(self.len())
+                    .add_buffer(Buffer::from_vec(self.offsets))
+                    .add_buffer(Buffer::from_vec(self.values))
+                    .null_bit_buffer(null_buffer);
+
+                let data = match cfg!(debug_assertions) {
+                    true => array_data_builder.build().unwrap(),
+                    false => unsafe { array_data_builder.build_unchecked() },
+                };
+
+                make_array(data)
+            }
+        }
+    }
+
+    fn build_generic_byte_view(&self) -> 
GenericByteViewBuilder<BinaryViewType> {
+        let mut builder = 
GenericByteViewBuilder::<BinaryViewType>::with_capacity(self.len());
+        for i in self.offsets.windows(2) {
+            let start = i[0];
+            let end = i[1];
+            let b = unsafe {
+                std::slice::from_raw_parts(
+                    self.values.as_ptr().offset(start.to_isize().unwrap()),
+                    (end - start).to_usize().unwrap(),
+                )
+            };
+            if b.is_empty() {
+                builder.append_null();
+            } else {
+                builder.append_value(b);

Review Comment:
   This call will effectively copy the (string) bytes over from the 
OffsetBuilder and is likely not much more efficient than `cast`ing from 
StringArray to `StringViewArray`.
   
    I think a better solution would be to 
   1. Create the view directly (maybe via a ViewsBuilder)
   1. Call  `Buffer::from_vec(self.values)` to take the underlying values 
directly without copying and then assemble the array from the data directly, as 
done for `into_array`
   
   However, I think the ideal solution will likely avoid using the 
`OffsetBuffer` entirely (and instead would reuse the underlying decompressed 
parquet page), so maybe this solution is good for now



##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -1222,6 +1227,31 @@ mod tests {
         roundtrip(batch, Some(SMALL_SIZE / 2));
     }
 
+    #[test]
+    fn arrow_writer_binary_view() {
+        let string_field = Field::new("a", DataType::Utf8View, false);
+        let binary_field = Field::new("b", DataType::BinaryView, false);
+        let schema = Schema::new(vec![string_field, binary_field]);
+
+        let raw_string_values = vec!["foo", "bar", "large payload over 12 
bytes", "lulu"];
+        let raw_binary_values = vec![
+            b"foo".to_vec(),
+            b"bar".to_vec(),
+            b"large payload over 12 bytes".to_vec(),
+            b"lulu".to_vec(),
+        ];
+
+        let string_view_values = StringViewArray::from(raw_string_values);
+        let binary_view_values = 
BinaryViewArray::from_iter_values(raw_binary_values);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![Arc::new(string_view_values), Arc::new(binary_view_values)],
+        )
+        .unwrap();
+
+        roundtrip(batch, Some(SMALL_SIZE / 2));

Review Comment:
   Can you also verify with a smaller batch size (seems SMALL_SIZE is 64) -- 
perhaps with `None` and `Some(SMALL_SIZE / 3)`?



##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -1222,6 +1227,31 @@ mod tests {
         roundtrip(batch, Some(SMALL_SIZE / 2));
     }
 
+    #[test]
+    fn arrow_writer_binary_view() {
+        let string_field = Field::new("a", DataType::Utf8View, false);
+        let binary_field = Field::new("b", DataType::BinaryView, false);
+        let schema = Schema::new(vec![string_field, binary_field]);
+
+        let raw_string_values = vec!["foo", "bar", "large payload over 12 
bytes", "lulu"];

Review Comment:
   Can you please add nulls to this dataset? 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to