alamb commented on code in PR #5860:
URL: https://github.com/apache/arrow-rs/pull/5860#discussion_r1638724323
##########
parquet/Cargo.toml:
##########
@@ -68,6 +68,9 @@ twox-hash = { version = "1.6", default-features = false }
paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
+dsi-progress-logger = { version = "0.2.4", optional = true }
Review Comment:
Could you please remove these new dependencies (even though I do realize
they are optional and won't be activated very often)
I think they will add some ongoing maintenance cost (keeping the
dependencies updated) which I would prefer to avoid if possible
##########
parquet/src/file/writer.rs:
##########
@@ -331,14 +315,17 @@ impl<W: Write + Send> SerializedFileWriter<W> {
self.finished = true;
let num_rows = self.row_groups.iter().map(|x| x.num_rows()).sum();
+ for row_group in &mut self.row_groups {
Review Comment:
```suggestion
// write out any remaining bloom filters after all row groups
for row_group in &mut self.row_groups {
```
##########
parquet/src/file/properties.rs:
##########
@@ -479,6 +508,12 @@ impl WriterPropertiesBuilder {
self
}
+ /// Sets where in the final file Bloom Filters are written
Review Comment:
```suggestion
/// Sets where in the final file Bloom Filters are written (default
`AfterRowGroup`)
```
##########
parquet/src/file/writer.rs:
##########
@@ -197,18 +199,28 @@ impl<W: Write + Send> SerializedFileWriter<W> {
self.row_group_index += 1;
+ let bloom_filter_position = self.properties().bloom_filter_position();
let row_groups = &mut self.row_groups;
let row_bloom_filters = &mut self.bloom_filters;
let row_column_indexes = &mut self.column_indexes;
let row_offset_indexes = &mut self.offset_indexes;
- let on_close =
- |metadata, row_group_bloom_filter, row_group_column_index,
row_group_offset_index| {
- row_groups.push(metadata);
- row_bloom_filters.push(row_group_bloom_filter);
- row_column_indexes.push(row_group_column_index);
- row_offset_indexes.push(row_group_offset_index);
- Ok(())
+ let on_close = move |buf,
+ mut metadata,
+ row_group_bloom_filter,
+ row_group_column_index,
+ row_group_offset_index| {
+ row_bloom_filters.push(row_group_bloom_filter);
+ row_column_indexes.push(row_group_column_index);
+ row_offset_indexes.push(row_group_offset_index);
+ match bloom_filter_position {
Review Comment:
```suggestion
// write bloom filters out immediately after the row group if
requested
match bloom_filter_position {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]