tustvold commented on code in PR #2500:
URL: https://github.com/apache/arrow-rs/pull/2500#discussion_r949398304
##########
parquet/src/arrow/array_reader/map_array.rs:
##########
@@ -65,131 +85,122 @@ impl ArrayReader for MapArrayReader {
}
fn read_records(&mut self, batch_size: usize) -> Result<usize> {
- let key_len = self.key_reader.read_records(batch_size)?;
- let value_len = self.value_reader.read_records(batch_size)?;
- // Check that key and value have the same lengths
- if key_len != value_len {
- return Err(general_err!(
- "Map key and value should have the same lengths."
- ));
- }
- Ok(key_len)
+ self.reader.read_records(batch_size)
}
fn consume_batch(&mut self) -> Result<ArrayRef> {
- let key_array = self.key_reader.consume_batch()?;
- let value_array = self.value_reader.consume_batch()?;
-
- // Check that key and value have the same lengths
- let key_length = key_array.len();
- if key_length != value_array.len() {
- return Err(general_err!(
- "Map key and value should have the same lengths."
- ));
- }
-
- let def_levels = self
- .key_reader
- .get_def_levels()
- .ok_or_else(|| ArrowError("item_reader def levels are
None.".to_string()))?;
- let rep_levels = self
- .key_reader
- .get_rep_levels()
- .ok_or_else(|| ArrowError("item_reader rep levels are
None.".to_string()))?;
-
- if !((def_levels.len() == rep_levels.len()) && (rep_levels.len() ==
key_length)) {
- return Err(ArrowError(
- "Expected item_reader def_levels and rep_levels to be same
length as batch".to_string(),
- ));
- }
-
- let entry_data_type = if let ArrowType::Map(field, _) =
&self.data_type {
- field.data_type().clone()
- } else {
- return Err(ArrowError("Expected a map arrow type".to_string()));
- };
-
- let entry_data = ArrayDataBuilder::new(entry_data_type)
- .len(key_length)
- .add_child_data(key_array.into_data())
- .add_child_data(value_array.into_data());
- let entry_data = unsafe { entry_data.build_unchecked() };
-
- let entry_len = rep_levels.iter().filter(|level| **level == 0).count();
-
- // first item in each list has rep_level = 0, subsequent items have
rep_level = 1
- let mut offsets: Vec<i32> = Vec::new();
- let mut cur_offset = 0;
- def_levels.iter().zip(rep_levels).for_each(|(d, r)| {
- if *r == 0 || d == &self.map_def_level {
- offsets.push(cur_offset);
- }
- if d > &self.map_def_level {
- cur_offset += 1;
- }
- });
- offsets.push(cur_offset);
-
- let num_bytes = bit_util::ceil(offsets.len(), 8);
- // TODO: A useful optimization is to use the null count to fill with
- // 0 or null, to reduce individual bits set in a loop.
- // To favour dense data, set every slot to true, then unset
- let mut null_buf =
MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
- let null_slice = null_buf.as_slice_mut();
- let mut list_index = 0;
- for i in 0..rep_levels.len() {
- // If the level is lower than empty, then the slot is null.
- // When a list is non-nullable, its empty level = null level,
- // so this automatically factors that in.
- if rep_levels[i] == 0 && def_levels[i] < self.map_def_level {
- // should be empty list
- bit_util::unset_bit(null_slice, list_index);
- }
- if rep_levels[i] == 0 {
- list_index += 1;
- }
- }
- let value_offsets = Buffer::from(&offsets.to_byte_slice());
-
- // Now we can build array data
- let array_data = ArrayDataBuilder::new(self.data_type.clone())
- .len(entry_len)
- .add_buffer(value_offsets)
- .null_bit_buffer(Some(null_buf.into()))
- .add_child_data(entry_data);
-
- let array_data = unsafe { array_data.build_unchecked() };
-
- Ok(Arc::new(MapArray::from(array_data)))
+ // A MapArray is just a ListArray with a StructArray child
+ // we can therefore just alter the ArrayData
+ let array = self.reader.consume_batch().unwrap();
+ let data = array.data().clone();
+ let builder = data.into_builder().data_type(self.data_type.clone());
+ Ok(Arc::new(MapArray::from(unsafe {
+ builder.build_unchecked()
Review Comment:
Pretty much, will add a doc comment
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org