This is an automated email from the ASF dual-hosted git repository. jiayuliu pushed a commit to branch add-bloom-filter-2 in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit d3d407b293091bd71c04f865b0c7c896ac52d452 Author: Jiayu Liu <[email protected]> AuthorDate: Sun Nov 13 13:24:10 2022 +0000 add api --- parquet/src/file/reader.rs | 6 ++++++ parquet/src/file/serialized_reader.rs | 15 +++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 70ff37a41..325944c21 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -21,6 +21,8 @@ use bytes::Bytes; use std::{boxed::Box, io::Read, sync::Arc}; +#[cfg(feature = "bloom")] +use crate::bloom_filter::Sbbf; use crate::column::page::PageIterator; use crate::column::{page::PageReader, reader::ColumnReader}; use crate::errors::{ParquetError, Result}; @@ -143,6 +145,10 @@ pub trait RowGroupReader: Send + Sync { Ok(col_reader) } + #[cfg(feature = "bloom")] + /// Get bloom filter for the `i`th column chunk, if present. + fn get_column_bloom_filter(&self, i: usize) -> Result<Option<Sbbf>>; + /// Get iterator of `Row`s from this row group. /// /// Projected schema can be a subset of or equal to the file schema, when it is None, diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index a400d4dab..8cefe1c5e 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -22,11 +22,9 @@ use std::collections::VecDeque; use std::io::Cursor; use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; -use crate::format::{PageHeader, PageLocation, PageType}; -use bytes::{Buf, Bytes}; -use thrift::protocol::TCompactInputProtocol; - use crate::basic::{Encoding, Type}; +#[cfg(feature = "bloom")] +use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; @@ -38,10 +36,13 @@ use crate::file::{ reader::*, statistics, }; +use crate::format::{PageHeader, PageLocation, PageType}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::util::{io::TryClone, memory::ByteBufferPtr}; +use bytes::{Buf, Bytes}; +use thrift::protocol::TCompactInputProtocol; // export `SliceableCursor` and `FileSource` publically so clients can // re-use the logic in their own ParquetFileWriter wrappers pub use crate::util::io::FileSource; @@ -387,6 +388,12 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' )?)) } + #[cfg(feature = "bloom")] + /// get bloom filter for the ith column + fn get_column_bloom_filter(&self, i: usize) -> Result<Option<Sbbf>> { + todo!() + } + fn get_row_iter(&self, projection: Option<SchemaType>) -> Result<RowIter> { RowIter::from_row_group(projection, self) }
