alamb commented on code in PR #7309: URL: https://github.com/apache/arrow-rs/pull/7309#discussion_r2018967683
########## arrow-array/src/builder/generic_bytes_builder.rs: ########## @@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> { self.offsets_builder.append(self.next_offset()); } + /// Appends array values and null to this builder as is + /// (this means that underlying null values are copied as is). + #[inline] + pub fn append_array(&mut self, array: &GenericByteArray<T>) { + if array.len() == 0 { + return; + } + + let offsets = array.offsets(); + + // If the offsets are contiguous, we can append them directly avoiding the need to align Review Comment: when would this case happen (contiguous offsets)? It seems like it would only happen if you are `concat`ing two slices of the same original underlying array 🤔 ########## arrow-array/src/builder/generic_bytes_builder.rs: ########## @@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> { self.offsets_builder.append(self.next_offset()); } + /// Appends array values and null to this builder as is + /// (this means that underlying null values are copied as is). + #[inline] + pub fn append_array(&mut self, array: &GenericByteArray<T>) { + if array.len() == 0 { + return; + } + + let offsets = array.offsets(); + + // If the offsets are contiguous, we can append them directly avoiding the need to align + // them + if self.next_offset() == offsets[0] { + self.offsets_builder.append_slice(&offsets[1..]); + } else { + // Shifting all the offsets + let shift: T::Offset = self.next_offset() - offsets[0]; + + // Creating intermediate offsets instead of pushing each offset is faster + // (even if we make MutableBuffer to avoid updating length on each push + // and reserve the necessary capacity, it's still slower) + let mut intermediate = Vec::with_capacity(offsets.len() - 1); Review Comment: 🤔 maybe we need something like `extend_from_iter` in offsets builder (not for this PR) ########## arrow-array/src/builder/generic_bytes_builder.rs: ########## @@ -593,4 +635,101 @@ mod tests { &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] ) } + + #[test] + fn test_append_array_without_nulls() { + let input = vec![ + "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", + "thank", "you", "for", "asking", + ]; + let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); + let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); + let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr1); + builder.append_array(&arr2); + builder.append_array(&arr3); + + let actual = builder.finish(); + let expected = GenericStringArray::<i32>::from(input); + + assert_eq!(actual, expected); + } + + #[test] + fn test_append_array_with_nulls() { + let input = vec![ + Some("hello"), + None, + Some("how"), + None, + None, + None, + None, + Some("I"), + Some("am"), + Some("doing"), + Some("well"), + ]; + let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); + let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); + let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr1); + builder.append_array(&arr2); + builder.append_array(&arr3); + + let actual = builder.finish(); + let expected = GenericStringArray::<i32>::from(input); + + assert_eq!(actual, expected); + } + + #[test] + fn test_append_empty_array() { + let arr = GenericStringArray::<i32>::from(Vec::<&str>::new()); + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr); + let result = builder.finish(); + assert_eq!(result.len(), 0); + } + + #[test] + fn test_append_array_with_offset_not_starting_at_0() { + let input = vec![ + Some("hello"), + None, + Some("how"), + None, + None, + None, + None, + Some("I"), + Some("am"), + Some("doing"), + Some("well"), + ]; + let full_array = GenericStringArray::<i32>::from(input); + let sliced = full_array.slice(1, 4); + + assert_ne!(sliced.offsets()[0].as_usize(), 0); + assert_ne!(sliced.offsets().last(), full_array.offsets().last()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&sliced); + let actual = builder.finish(); + + let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]); + + println!("actual: {:?}", actual); + println!("expected: {:?}", expected); + assert_eq!(actual, expected); + } + + #[test] + fn test_append_underlying_null_values_added_as_is() { + // TODO Review Comment: Do you meant to leave this as a TODO? ########## arrow-array/src/builder/generic_bytes_builder.rs: ########## @@ -593,4 +635,101 @@ mod tests { &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] ) } + + #[test] + fn test_append_array_without_nulls() { + let input = vec![ + "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", + "thank", "you", "for", "asking", + ]; + let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); + let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); + let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr1); + builder.append_array(&arr2); + builder.append_array(&arr3); + + let actual = builder.finish(); + let expected = GenericStringArray::<i32>::from(input); + + assert_eq!(actual, expected); + } + + #[test] + fn test_append_array_with_nulls() { + let input = vec![ + Some("hello"), + None, + Some("how"), + None, + None, + None, + None, + Some("I"), + Some("am"), + Some("doing"), + Some("well"), + ]; + let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); + let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); + let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr1); + builder.append_array(&arr2); + builder.append_array(&arr3); + + let actual = builder.finish(); + let expected = GenericStringArray::<i32>::from(input); + + assert_eq!(actual, expected); + } + + #[test] + fn test_append_empty_array() { + let arr = GenericStringArray::<i32>::from(Vec::<&str>::new()); + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr); + let result = builder.finish(); + assert_eq!(result.len(), 0); + } + + #[test] + fn test_append_array_with_offset_not_starting_at_0() { + let input = vec![ + Some("hello"), + None, + Some("how"), + None, + None, + None, + None, + Some("I"), + Some("am"), + Some("doing"), + Some("well"), + ]; + let full_array = GenericStringArray::<i32>::from(input); + let sliced = full_array.slice(1, 4); + + assert_ne!(sliced.offsets()[0].as_usize(), 0); + assert_ne!(sliced.offsets().last(), full_array.offsets().last()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&sliced); + let actual = builder.finish(); + + let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]); + + println!("actual: {:?}", actual); + println!("expected: {:?}", expected); + assert_eq!(actual, expected); Review Comment: Minor but you can avoid printing the results except if the assert fails like ```suggestion assert_eq!(actual, expected, "actual: {actual:?}\nexpected: {expected:?}"); ``` ########## arrow/benches/concatenate_kernel.rs: ########## @@ -47,6 +47,26 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_concat(&v1, &v2)) }); + { Review Comment: As you added code for boolean and Utf8 arrays, can you also please add benchmarks for those cases too? It would be really nice (and faster to review/merge) if you added the new benchmarks in a separate PR ########## arrow-array/src/builder/generic_bytes_builder.rs: ########## @@ -593,4 +635,101 @@ mod tests { &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] ) } + + #[test] + fn test_append_array_without_nulls() { + let input = vec![ + "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", + "thank", "you", "for", "asking", + ]; + let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); + let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); + let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr1); + builder.append_array(&arr2); + builder.append_array(&arr3); + + let actual = builder.finish(); + let expected = GenericStringArray::<i32>::from(input); + + assert_eq!(actual, expected); + } + + #[test] + fn test_append_array_with_nulls() { + let input = vec![ + Some("hello"), + None, + Some("how"), + None, + None, + None, + None, + Some("I"), + Some("am"), + Some("doing"), + Some("well"), + ]; + let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); + let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); + let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); + + let mut builder = GenericStringBuilder::<i32>::new(); + builder.append_array(&arr1); + builder.append_array(&arr2); + builder.append_array(&arr3); + + let actual = builder.finish(); + let expected = GenericStringArray::<i32>::from(input); + + assert_eq!(actual, expected); + } + + #[test] + fn test_append_empty_array() { Review Comment: Could you please also add a test where the offsets are contiguous? Maybe I missed it. For example, I think if you sliced some data off the end of an array and then tried to append it back -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org