alamb commented on code in PR #8257: URL: https://github.com/apache/arrow-rs/pull/8257#discussion_r2326067523
########## parquet/src/column/writer/mod.rs: ########## @@ -4236,4 +4247,33 @@ mod tests { .unwrap(); ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path) } + + #[test] + fn test_page_v2_snappy_compression_fallback() { + // Test that PageV2 sets is_compressed to false when Snappy compression increases data size + let page_writer = TestPageWriter {}; + + // Create WriterProperties with PageV2 and Snappy compression + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + // Disable dictionary to ensure data is written directly + .set_dictionary_enabled(false) + .set_compression(Compression::SNAPPY) + .build(); + + let mut column_writer = + get_test_column_writer::<ByteArrayType>(Box::new(page_writer), 0, 0, Arc::new(props)); + + // Create small, simple data that Snappy compression will likely increase in size + // due to compression overhead for very small data Review Comment: 👍 ########## parquet/src/column/writer/mod.rs: ########## @@ -1104,12 +1104,23 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { rep_levels_byte_len + def_levels_byte_len + values_data.buf.len(); // Data Page v2 compresses values only. - match self.compressor { + let is_compressed = match self.compressor { Some(ref mut cmpr) => { + let buffer_len = buffer.len(); cmpr.compress(&values_data.buf, &mut buffer)?; + if uncompressed_size <= buffer.len() - buffer_len { Review Comment: Yeah, I do think that would need to configurable as I think some people would treat a 10% size increase as a regression, even if they got a faster reader A good follow on ticket for sure -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org