tustvold commented on code in PR #4389:
URL: https://github.com/apache/arrow-rs/pull/4389#discussion_r1225793090
##########
parquet/src/column/writer/mod.rs:
##########
@@ -1152,9 +1213,78 @@ fn compare_greater_byte_array_decimals(a: &[u8], b:
&[u8]) -> bool {
(a[1..]) > (b[1..])
}
+/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8
string, while being at most `length` bytes.
+fn truncate_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
+ // We return values like that at an earlier stage in the process.
+ assert!(data.len() >= length);
+ let mut char_indices = data.char_indices();
+
+ // We know `data` is a valid UTF8 encoded string, which means it has at
least one valid UTF8 byte, which will make this loop exit.
+ while let Some((idx, c)) = char_indices.next_back() {
+ let split_point = idx + c.len_utf8();
+ if split_point <= length {
+ return data.as_bytes()[0..split_point].to_vec().into();
+ }
+ }
+
+ unreachable!()
+}
+
+/// Truncate a binary slice to make sure its length is at most `length`
+fn truncate_binary(data: &[u8], length: usize) -> Option<Vec<u8>> {
+ // We return values like that at an earlier stage in the process.
+ assert!(data.len() >= length);
+ // If all bytes are already maximal, no need to truncate
+ if data.iter().all(|b| *b == u8::MAX) {
+ None
+ } else {
+ data[0..length].to_vec().into()
+ }
Review Comment:
```suggestion
data[0..length].to_vec().into()
```
##########
parquet/src/column/writer/mod.rs:
##########
@@ -1152,9 +1213,78 @@ fn compare_greater_byte_array_decimals(a: &[u8], b:
&[u8]) -> bool {
(a[1..]) > (b[1..])
}
+/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8
string, while being at most `length` bytes.
+fn truncate_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
+ // We return values like that at an earlier stage in the process.
+ assert!(data.len() >= length);
+ let mut char_indices = data.char_indices();
+
+ // We know `data` is a valid UTF8 encoded string, which means it has at
least one valid UTF8 byte, which will make this loop exit.
+ while let Some((idx, c)) = char_indices.next_back() {
+ let split_point = idx + c.len_utf8();
+ if split_point <= length {
+ return data.as_bytes()[0..split_point].to_vec().into();
+ }
+ }
+
+ unreachable!()
+}
+
+/// Truncate a binary slice to make sure its length is at most `length`
+fn truncate_binary(data: &[u8], length: usize) -> Option<Vec<u8>> {
+ // We return values like that at an earlier stage in the process.
+ assert!(data.len() >= length);
+ // If all bytes are already maximal, no need to truncate
+ if data.iter().all(|b| *b == u8::MAX) {
Review Comment:
But that is only because the increment step can't proceed - which is what
will return the error?
##########
parquet/src/column/writer/mod.rs:
##########
@@ -2220,12 +2348,219 @@ mod tests {
);
}
+ /// Verify min/max value truncation in the column index works as expected
+ #[test]
+ fn test_column_offset_index_metadata_truncating() {
+ // write data
+ // and check the offset index and column index
+ let page_writer = get_test_page_writer();
+ let props = Default::default();
+ let mut writer =
+ get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0,
props);
+
+ let mut data = vec![FixedLenByteArray::default(); 3];
+ // This is the expected min value - "aaa..."
+ data[0].set_data(ByteBufferPtr::new(vec![97_u8; 200]));
+ // This is the expected max value - "ppp..." (byte 112 is ASCII 'p')
+ data[1].set_data(ByteBufferPtr::new(vec![112_u8; 200]));
+ data[2].set_data(ByteBufferPtr::new(vec![98_u8; 200]));
+
+ writer.write_batch(&data, None, None).unwrap();
+
+ writer.flush_data_pages().unwrap();
+
+ let r = writer.close().unwrap();
+ let column_index = r.column_index.unwrap();
+ let offset_index = r.offset_index.unwrap();
+
+ assert_eq!(3, r.rows_written);
+
+ // column index
+ assert_eq!(1, column_index.null_pages.len());
+ assert_eq!(1, offset_index.page_locations.len());
+ assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order);
+ assert!(!column_index.null_pages[0]);
+ assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]);
+
+ if let Some(stats) = r.metadata.statistics() {
+ assert!(stats.has_min_max_set());
+ assert_eq!(stats.null_count(), 0);
+ assert_eq!(stats.distinct_count(), None);
+ if let Statistics::FixedLenByteArray(stats) = stats {
+ let column_index_min_value =
column_index.min_values.get(0).unwrap();
+ let column_index_max_value =
column_index.max_values.get(0).unwrap();
+
+ // Column index stats are truncated, while the column chunk's
aren't.
+ assert_ne!(stats.min_bytes(),
column_index_min_value.as_slice());
+ assert_ne!(stats.max_bytes(),
column_index_max_value.as_slice());
+
+ assert_eq!(
+ column_index_min_value.len(),
+ DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH.unwrap()
+ );
+ assert_eq!(column_index_min_value.as_slice(), &[97_u8; 64]);
+ assert_eq!(
+ column_index_max_value.len(),
+ DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH.unwrap()
+ );
+
+ // We expect the last byte to be incremented
+ assert_eq!(
+ *column_index_max_value.last().unwrap(),
+ *column_index_max_value.first().unwrap() + 1
+ );
+ } else {
+ panic!("expecting Statistics::FixedLenByteArray");
+ }
+ } else {
+ panic!("metadata missing statistics");
+ }
+ }
+
+ #[test]
+ fn test_column_offset_index_truncating_spec_example() {
+ // write data
+ // and check the offset index and column index
+ let page_writer = get_test_page_writer();
+
+ // Truncate values at 1 byte
+ let builder =
+
WriterProperties::builder().set_column_index_truncate_length(Some(1));
+ let props = Arc::new(builder.build());
+ let mut writer =
+ get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0,
props);
+
+ let mut data = vec![FixedLenByteArray::default(); 1];
+ // This is the expected min value
+ data[0].set_data(ByteBufferPtr::new(
+ String::from("Blart Versenwald III").into_bytes(),
+ ));
+
+ writer.write_batch(&data, None, None).unwrap();
+
+ writer.flush_data_pages().unwrap();
+
+ let r = writer.close().unwrap();
+ let column_index = r.column_index.unwrap();
+ let offset_index = r.offset_index.unwrap();
+
+ assert_eq!(1, r.rows_written);
+
+ // column index
+ assert_eq!(1, column_index.null_pages.len());
+ assert_eq!(1, offset_index.page_locations.len());
+ assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order);
+ assert!(!column_index.null_pages[0]);
+ assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]);
+
+ if let Some(stats) = r.metadata.statistics() {
+ assert!(stats.has_min_max_set());
+ assert_eq!(stats.null_count(), 0);
+ assert_eq!(stats.distinct_count(), None);
+ if let Statistics::FixedLenByteArray(_stats) = stats {
+ let column_index_min_value =
column_index.min_values.get(0).unwrap();
+ let column_index_max_value =
column_index.max_values.get(0).unwrap();
+
+ assert_eq!(column_index_min_value.len(), 1);
+ assert_eq!(column_index_max_value.len(), 1);
+
+ assert_eq!("B".as_bytes(), column_index_min_value.as_slice());
+ assert_eq!("C".as_bytes(), column_index_max_value.as_slice());
+
+ assert_ne!(column_index_min_value, stats.min_bytes());
+ assert_ne!(column_index_max_value, stats.max_bytes());
+ } else {
+ panic!("expecting Statistics::FixedLenByteArray");
+ }
+ } else {
+ panic!("metadata missing statistics");
+ }
+ }
+
#[test]
fn test_send() {
fn test<T: Send>() {}
test::<ColumnWriterImpl<Int32Type>>();
}
+ #[test]
+ fn test_increment() {
+ let v = increment(vec![0, 0, 0]).unwrap();
+ assert_eq!(&v, &[0, 0, 1]);
+
+ // Handle overflow
+ let v = increment(vec![0, 255, 255]).unwrap();
+ assert_eq!(&v, &[1, 0, 0]);
+
+ // Return `None` if all bytes are u8::MAX
+ let v = increment(vec![255, 255, 255]);
+ assert!(v.is_none());
+ }
+
+ #[test]
+ fn test_increment_utf8() {
+ // Basic ASCII case
+ let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
+ assert_eq!(&v, "hellp".as_bytes());
+
+ // Also show that BinaryArray level comparison works here
+ let mut greater = ByteArray::new();
+ greater.set_data(ByteBufferPtr::new(v));
+ let mut original = ByteArray::new();
+ original.set_data(ByteBufferPtr::new("hello".as_bytes().to_vec()));
+ assert!(greater > original);
+
+ // UTF8 string
+ let s = "โค๏ธ๐งก๐๐๐๐";
+ let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+
+ if let Ok(new) = String::from_utf8(v) {
+ assert_ne!(&new, s);
+ assert_eq!(new, "โค๏ธ๐งก๐๐๐๐");
+ assert!(new.as_bytes().last().unwrap() >
s.as_bytes().last().unwrap());
+ } else {
+ panic!("Expected incremented UTF8 string to also be valid.")
+ }
+
+ // Max UTF8 character - should be a No-Op
+ let s = char::MAX.to_string();
+ assert_eq!(s.len(), 4);
+ let v = increment_utf8(s.as_bytes().to_vec());
+ assert!(v.is_none());
+
+ // Handle multi-byte UTF8 characters
+ let s = "a\u{10ffff}";
+ let v = increment_utf8(s.as_bytes().to_vec());
+ assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes());
+ }
+
+ #[test]
+ fn test_truncate_utf8() {
+ // No-op
+ let data = "โค๏ธ๐งก๐๐๐๐";
+ let r = truncate_utf8(data, data.as_bytes().len()).unwrap();
+ assert_eq!(r.len(), data.as_bytes().len());
+ assert_eq!(&r, data.as_bytes());
+ println!("len is {}", data.len());
+
+ // We slice it away from the UTF8 boundary
+ let r = truncate_utf8(data, 13).unwrap();
+ assert_eq!(r.len(), 10);
+ assert_eq!(&r, "โค๏ธ๐งก".as_bytes());
+ }
+
+ #[test]
+ fn test_truncate_max_binary_chars() {
+ let r =
+ truncate_binary(&[0xFF, 0xFE, 0xFD, 0xFF, 0xFF, 0xFF],
5).and_then(increment);
+
+ assert_eq!(&r.unwrap(), &[0xFF, 0xFE, 0xFE, 0x00, 0x00]);
+
+ // Can't truncate this slice.
+ let r = truncate_binary(&[0xFF, 0xFF, 0xFF, 0xFF], 3);
Review Comment:
```suggestion
let r = truncate_binary(&[0xFF, 0xFF, 0xFF, 0xFF],
3).and_then(increment);
assert!(r.is_none());
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]