This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/active_release by this push:
new be471fd allocate enough bytes when writing booleans (#658) (#677)
be471fd is described below
commit be471fd7859dd6c885e537169aa099df4c63a9d8
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Aug 9 07:25:32 2021 -0400
allocate enough bytes when writing booleans (#658) (#677)
* allocate enough bytes when writing booleans
* round up to nearest multiple of 256
Co-authored-by: Ben Chambers <[email protected]>
---
parquet/src/arrow/arrow_writer.rs | 28 +++++++++++++++++++++++++++-
parquet/src/data_type.rs | 8 +++++++-
2 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 3ff1304..fcd8086 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -218,7 +218,7 @@ fn write_leaves(
ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_) => {
Err(ParquetError::NYI(
format!(
- "Attempting to write an Arrow type {:?} to parquet that is
not yet implemented",
+ "Attempting to write an Arrow type {:?} to parquet that is
not yet implemented",
array.data_type()
)
))
@@ -1161,6 +1161,32 @@ mod tests {
}
#[test]
+ fn bool_large_single_column() {
+ let values = Arc::new(
+ [None, Some(true), Some(false)]
+ .iter()
+ .cycle()
+ .copied()
+ .take(200_000)
+ .collect::<BooleanArray>(),
+ );
+ let schema =
+ Schema::new(vec![Field::new("col", values.data_type().clone(), true)]);
+ let expected_batch =
+ RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
+ let file = get_temp_file("bool_large_single_column", &[]);
+
+ let mut writer = ArrowWriter::try_new(
+ file.try_clone().unwrap(),
+ expected_batch.schema(),
+ None,
+ )
+ .expect("Unable to write file");
+ writer.write(&expected_batch).unwrap();
+ writer.close().unwrap();
+ }
+
+ #[test]
fn i8_single_column() {
required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8, "i8_single_column");
}
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 127ba95..3573362 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -588,6 +588,7 @@ pub(crate) mod private {
use crate::util::bit_util::{BitReader, BitWriter};
use crate::util::memory::ByteBufferPtr;
+ use arrow::util::bit_util::round_upto_power_of_2;
use byteorder::ByteOrder;
use std::convert::TryInto;
@@ -669,7 +670,12 @@ pub(crate) mod private {
bit_writer: &mut BitWriter,
) -> Result<()> {
if bit_writer.bytes_written() + values.len() / 8 >= bit_writer.capacity() {
- bit_writer.extend(256);
+ let bits_available =
+ (bit_writer.capacity() - bit_writer.bytes_written()) * 8;
+ let bits_needed = values.len() - bits_available;
+ let bytes_needed = (bits_needed + 7) / 8;
+ let bytes_needed = round_upto_power_of_2(bytes_needed, 256);
+ bit_writer.extend(bytes_needed);
}
for value in values {
if !bit_writer.put_value(*value as u64, 1) {