This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 5595019ba chore: add docs, part of #37 (#6496)
5595019ba is described below
commit 5595019bad5f7fa8c75106d31bf86c8c99d2822c
Author: ByteBaker <[email protected]>
AuthorDate: Wed Oct 2 15:39:51 2024 +0530
chore: add docs, part of #37 (#6496)
- add pragma `#![warn(missing_docs)]` to `parquet`
This is the final component in the effort to make Arrow
fully-documented. The entire project now generates warnings
for missing docs, if any.
- `arrow-flight`: replace `tonic`'s deprecated `compile_with_config`
with suggested method
- new deprecation:
The following types were not used anywhere and were possibly strays.
They've been marked as deprecated and will be removed in future
versions.
- `parquet::data_types::SliceAsBytesDataType`
- `parquet::column::writer::Level`
---
arrow-flight/gen/src/main.rs | 6 +--
parquet/src/arrow/async_reader/metadata.rs | 1 +
parquet/src/basic.rs | 47 +++++++++++++++++++++++-
parquet/src/column/page.rs | 28 ++++++++++++++
parquet/src/column/reader.rs | 8 ++++
parquet/src/column/writer/mod.rs | 15 ++++++++
parquet/src/data_type.rs | 22 +++++++++++
parquet/src/errors.rs | 2 +
parquet/src/file/footer.rs | 2 +
parquet/src/file/metadata/mod.rs | 8 ++++
parquet/src/file/metadata/writer.rs | 1 +
parquet/src/file/page_index/index.rs | 28 ++++++++++++++
parquet/src/file/page_index/offset_index.rs | 3 ++
parquet/src/file/properties.rs | 2 +
parquet/src/file/reader.rs | 2 +-
parquet/src/file/statistics.rs | 14 ++++++-
parquet/src/file/writer.rs | 1 +
parquet/src/lib.rs | 3 +-
parquet/src/record/api.rs | 57 +++++++++++++++++++++++++++--
parquet/src/record/reader.rs | 16 ++++----
parquet/src/record/record_reader.rs | 3 +-
parquet/src/schema/types.rs | 11 ++++++
parquet/src/schema/visitor.rs | 2 +
parquet/src/thrift.rs | 2 +
24 files changed, 262 insertions(+), 22 deletions(-)
diff --git a/arrow-flight/gen/src/main.rs b/arrow-flight/gen/src/main.rs
index c4cb9dfec..a69134e7a 100644
--- a/arrow-flight/gen/src/main.rs
+++ b/arrow-flight/gen/src/main.rs
@@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let proto_path = Path::new("../format/Flight.proto");
tonic_build::configure()
- // protoc in unbuntu builder needs this option
+ // protoc in Ubuntu builder needs this option
.protoc_arg("--experimental_allow_proto3_optional")
.out_dir("src")
.compile_protos_with_config(prost_config(), &[proto_path],
&[proto_dir])?;
@@ -37,7 +37,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.open("src/arrow.flight.protocol.rs")?;
let mut buffer = String::new();
file.read_to_string(&mut buffer)?;
- // append warning that file was auto-generate
+ // append warning that file was auto-generated
let mut file = OpenOptions::new()
.write(true)
.truncate(true)
@@ -49,7 +49,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let proto_path = Path::new("../format/FlightSql.proto");
tonic_build::configure()
- // protoc in ubuntu builder needs this option
+ // protoc in Ubuntu builder needs this option
.protoc_arg("--experimental_allow_proto3_optional")
.out_dir("src/sql")
.compile_protos_with_config(prost_config(), &[proto_path],
&[proto_dir])?;
diff --git a/parquet/src/arrow/async_reader/metadata.rs
b/parquet/src/arrow/async_reader/metadata.rs
index b7fac6fe7..b2c6159be 100644
--- a/parquet/src/arrow/async_reader/metadata.rs
+++ b/parquet/src/arrow/async_reader/metadata.rs
@@ -29,6 +29,7 @@ use std::ops::Range;
/// A data source that can be used with [`MetadataLoader`] to load
[`ParquetMetaData`]
pub trait MetadataFetch {
+ /// Fetches a range of bytes asynchronously
fn fetch(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>>;
}
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 8fde542f5..1926b8762 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -47,13 +47,21 @@ pub use crate::format::{
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum Type {
+ /// A boolean value.
BOOLEAN,
+ /// 32-bit signed integer.
INT32,
+ /// 64-bit signed integer.
INT64,
+ /// 96-bit signed integer for timestamps.
INT96,
+ /// IEEE 754 single-precision floating point value.
FLOAT,
+ /// IEEE 754 double-precision floating point value.
DOUBLE,
+ /// Arbitrary length byte array.
BYTE_ARRAY,
+ /// Fixed length byte array.
FIXED_LEN_BYTE_ARRAY,
}
@@ -70,6 +78,7 @@ pub enum Type {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum ConvertedType {
+ /// No type conversion.
NONE,
/// A BYTE_ARRAY actually contains UTF8 encoded chars.
UTF8,
@@ -171,31 +180,53 @@ pub enum ConvertedType {
/// [`ConvertedType`]. Please see the README.md for more details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogicalType {
+ /// A UTF8 encoded string.
String,
+ /// A map of key-value pairs.
Map,
+ /// A list of elements.
List,
+ /// A set of predefined values.
Enum,
+ /// A decimal value with a specified scale and precision.
Decimal {
+ /// The number of digits to the right of the decimal point.
scale: i32,
+ /// The total number of digits in the decimal.
precision: i32,
},
+ /// A date stored as days since Unix epoch.
Date,
+ /// A time stored as [`TimeUnit`] since midnight.
Time {
+ /// Whether the time is adjusted to UTC.
is_adjusted_to_u_t_c: bool,
+ /// The unit of time.
unit: TimeUnit,
},
+ /// A timestamp stored as [`TimeUnit`] since Unix epoch.
Timestamp {
+ /// Whether the timestamp is adjusted to UTC.
is_adjusted_to_u_t_c: bool,
+ /// The unit of time.
unit: TimeUnit,
},
+ /// An integer with a specified bit width and signedness.
Integer {
+ /// The number of bits in the integer.
bit_width: i8,
+ /// Whether the integer is signed.
is_signed: bool,
},
+ /// An unknown logical type.
Unknown,
+ /// A JSON document.
Json,
+ /// A BSON document.
Bson,
+ /// A UUID.
Uuid,
+ /// A 16-bit floating point number.
Float16,
}
@@ -350,13 +381,21 @@ impl FromStr for Encoding {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum Compression {
+ /// No compression.
UNCOMPRESSED,
+ /// [Snappy
compression](https://en.wikipedia.org/wiki/Snappy_(compression))
SNAPPY,
+ /// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt)
GZIP(GzipLevel),
+ /// [LZO
compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer)
LZO,
+ /// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932)
BROTLI(BrotliLevel),
+ /// [LZ4 compression](https://lz4.org/),
[(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032)
LZ4,
+ /// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878)
ZSTD(ZstdLevel),
+ /// [LZ4 compression](https://lz4.org/).
LZ4_RAW,
}
@@ -447,16 +486,20 @@ impl FromStr for Compression {
}
// ----------------------------------------------------------------------
-// Mirrors `parquet::PageType`
-
+/// Mirrors [parquet::PageType]
+///
/// Available data pages for Parquet file format.
/// Note that some of the page types may not be supported.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum PageType {
+ /// Data page Parquet 1.0
DATA_PAGE,
+ /// Index page
INDEX_PAGE,
+ /// Dictionary page
DICTIONARY_PAGE,
+ /// Data page Parquet 2.0
DATA_PAGE_V2,
}
diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs
index e3931dfe9..5c866318e 100644
--- a/parquet/src/column/page.rs
+++ b/parquet/src/column/page.rs
@@ -31,29 +31,51 @@ use crate::format::PageHeader;
/// used to store uncompressed bytes of the page.
#[derive(Clone)]
pub enum Page {
+ /// Data page Parquet format v1.
DataPage {
+ /// The underlying data buffer
buf: Bytes,
+ /// Number of values in this page
num_values: u32,
+ /// Encoding for values in this page
encoding: Encoding,
+ /// Definition level encoding
def_level_encoding: Encoding,
+ /// Repetition level encoding
rep_level_encoding: Encoding,
+ /// Optional statistics for this page
statistics: Option<Statistics>,
},
+ /// Data page Parquet format v2.
DataPageV2 {
+ /// The underlying data buffer
buf: Bytes,
+ /// Number of values in this page
num_values: u32,
+ /// Encoding for values in this page
encoding: Encoding,
+ /// Number of null values in this page
num_nulls: u32,
+ /// Number of rows in this page
num_rows: u32,
+ /// Length of definition levels
def_levels_byte_len: u32,
+ /// Length of repetition levels
rep_levels_byte_len: u32,
+ /// Is this page compressed
is_compressed: bool,
+ /// Optional statistics for this page
statistics: Option<Statistics>,
},
+ /// Dictionary page.
DictionaryPage {
+ /// The underlying data buffer
buf: Bytes,
+ /// Number of values in this page
num_values: u32,
+ /// Encoding for values in this page
encoding: Encoding,
+ /// Is dictionary page sorted
is_sorted: bool,
},
}
@@ -235,11 +257,17 @@ impl CompressedPage {
/// Contains page write metrics.
pub struct PageWriteSpec {
+ /// The type of page being written
pub page_type: PageType,
+ /// The total size of the page, before compression
pub uncompressed_size: usize,
+ /// The compressed size of the page
pub compressed_size: usize,
+ /// The number of values in the page
pub num_values: u32,
+ /// The offset of the page in the column chunk
pub offset: u64,
+ /// The number of bytes written to the underlying sink
pub bytes_written: u64,
}
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs
index 0c7cbb412..2b43b4c3e 100644
--- a/parquet/src/column/reader.rs
+++ b/parquet/src/column/reader.rs
@@ -34,13 +34,21 @@ pub(crate) mod decoder;
/// Column reader for a Parquet type.
pub enum ColumnReader {
+ /// Column reader for boolean type
BoolColumnReader(ColumnReaderImpl<BoolType>),
+ /// Column reader for int32 type
Int32ColumnReader(ColumnReaderImpl<Int32Type>),
+ /// Column reader for int64 type
Int64ColumnReader(ColumnReaderImpl<Int64Type>),
+ /// Column reader for int96 type
Int96ColumnReader(ColumnReaderImpl<Int96Type>),
+ /// Column reader for float type
FloatColumnReader(ColumnReaderImpl<FloatType>),
+ /// Column reader for double type
DoubleColumnReader(ColumnReaderImpl<DoubleType>),
+ /// Column reader for byte array type
ByteArrayColumnReader(ColumnReaderImpl<ByteArrayType>),
+ /// Column reader for fixed length byte array type
FixedLenByteArrayColumnReader(ColumnReaderImpl<FixedLenByteArrayType>),
}
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index e0d3abed8..6071b68c6 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -61,13 +61,21 @@ macro_rules! downcast_writer {
/// Column writer for a Parquet type.
pub enum ColumnWriter<'a> {
+ /// Column writer for boolean type
BoolColumnWriter(ColumnWriterImpl<'a, BoolType>),
+ /// Column writer for int32 type
Int32ColumnWriter(ColumnWriterImpl<'a, Int32Type>),
+ /// Column writer for int64 type
Int64ColumnWriter(ColumnWriterImpl<'a, Int64Type>),
+ /// Column writer for int96 (timestamp) type
Int96ColumnWriter(ColumnWriterImpl<'a, Int96Type>),
+ /// Column writer for float type
FloatColumnWriter(ColumnWriterImpl<'a, FloatType>),
+ /// Column writer for double type
DoubleColumnWriter(ColumnWriterImpl<'a, DoubleType>),
+ /// Column writer for byte array type
ByteArrayColumnWriter(ColumnWriterImpl<'a, ByteArrayType>),
+ /// Column writer for fixed length byte array type
FixedLenByteArrayColumnWriter(ColumnWriterImpl<'a, FixedLenByteArrayType>),
}
@@ -90,6 +98,11 @@ impl<'a> ColumnWriter<'a> {
}
}
+#[deprecated(
+ since = "54.0.0",
+ note = "Seems like a stray and nobody knows what's it for. Will be removed
in the next release."
+)]
+#[allow(missing_docs)]
pub enum Level {
Page,
Column,
@@ -309,6 +322,7 @@ impl<T: Default> ColumnMetrics<T> {
/// Typed column writer for a primitive column.
pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a,
ColumnValueEncoderImpl<T>>;
+/// Generic column writer for a primitive column.
pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
// Column writer properties
descr: ColumnDescPtr,
@@ -344,6 +358,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
}
impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
+ /// Returns a new instance of [`GenericColumnWriter`].
pub fn new(
descr: ColumnDescPtr,
props: WriterPropertiesPtr,
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index a3bcfd167..a3d0e3ce7 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -183,6 +183,7 @@ impl ByteArray {
)
}
+ /// Try to convert the byte array to a utf8 slice
pub fn as_utf8(&self) -> Result<&str> {
self.data
.as_ref()
@@ -349,20 +350,29 @@ impl From<FixedLenByteArray> for ByteArray {
pub enum Decimal {
/// Decimal backed by `i32`.
Int32 {
+ /// The underlying value
value: [u8; 4],
+ /// The total number of digits in the number
precision: i32,
+ /// The number of digits to the right of the decimal point
scale: i32,
},
/// Decimal backed by `i64`.
Int64 {
+ /// The underlying value
value: [u8; 8],
+ /// The total number of digits in the number
precision: i32,
+ /// The number of digits to the right of the decimal point
scale: i32,
},
/// Decimal backed by byte array.
Bytes {
+ /// The underlying value
value: ByteArray,
+ /// The total number of digits in the number
precision: i32,
+ /// The number of digits to the right of the decimal point
scale: i32,
},
}
@@ -1120,6 +1130,7 @@ pub(crate) mod private {
/// Contains the Parquet physical type information as well as the Rust
primitive type
/// presentation.
pub trait DataType: 'static + Send {
+ /// The physical type of the Parquet data type.
type T: private::ParquetValueType;
/// Returns Parquet physical type.
@@ -1130,20 +1141,24 @@ pub trait DataType: 'static + Send {
/// Returns size in bytes for Rust representation of the physical type.
fn get_type_size() -> usize;
+ /// Returns the underlying [`ColumnReaderImpl`] for the given
[`ColumnReader`].
fn get_column_reader(column_writer: ColumnReader) ->
Option<ColumnReaderImpl<Self>>
where
Self: Sized;
+ /// Returns the underlying [`ColumnWriterImpl`] for the given
[`ColumnWriter`].
fn get_column_writer(column_writer: ColumnWriter<'_>) ->
Option<ColumnWriterImpl<'_, Self>>
where
Self: Sized;
+ /// Returns a reference to the underlying [`ColumnWriterImpl`] for the
given [`ColumnWriter`].
fn get_column_writer_ref<'a, 'b: 'a>(
column_writer: &'b ColumnWriter<'a>,
) -> Option<&'b ColumnWriterImpl<'a, Self>>
where
Self: Sized;
+ /// Returns a mutable reference to the underlying [`ColumnWriterImpl`] for
the given [`ColumnWriter`].
fn get_column_writer_mut<'a, 'b: 'a>(
column_writer: &'a mut ColumnWriter<'b>,
) -> Option<&'a mut ColumnWriterImpl<'b, Self>>
@@ -1152,12 +1167,18 @@ pub trait DataType: 'static + Send {
}
// Workaround bug in specialization
+#[deprecated(
+ since = "54.0.0",
+ note = "Seems like a stray and nobody knows what's it for. Will be removed
in 55.0.0"
+)]
+#[allow(missing_docs)]
pub trait SliceAsBytesDataType: DataType
where
Self::T: SliceAsBytes,
{
}
+#[allow(deprecated)]
impl<T> SliceAsBytesDataType for T
where
T: DataType,
@@ -1167,6 +1188,7 @@ where
macro_rules! make_type {
($name:ident, $reader_ident: ident, $writer_ident: ident, $native_ty:ty,
$size:expr) => {
+ #[doc = concat!("Parquet physical type: ", stringify!($name))]
#[derive(Clone)]
pub struct $name {}
diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs
index a242c9768..bb4d2543c 100644
--- a/parquet/src/errors.rs
+++ b/parquet/src/errors.rs
@@ -42,6 +42,8 @@ pub enum ParquetError {
/// Arrow error.
/// Returned when reading into arrow or writing from arrow.
ArrowError(String),
+ /// Error when the requested column index is more than the
+ /// number of columns in the row group
IndexOutOfBound(usize, usize),
/// An external error variant
External(Box<dyn Error + Send + Sync>),
diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs
index 3dd698e3d..bd31c9142 100644
--- a/parquet/src/file/footer.rs
+++ b/parquet/src/file/footer.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+//! Module for working with Parquet file footers.
+
use crate::errors::Result;
use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE};
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 30b17b6a2..5a2ccbc02 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -419,6 +419,7 @@ impl From<ParquetMetaData> for ParquetMetaDataBuilder {
}
}
+/// A key-value pair for [`FileMetaData`].
pub type KeyValue = crate::format::KeyValue;
/// Reference counted pointer for [`FileMetaData`].
@@ -722,6 +723,7 @@ impl RowGroupMetaDataBuilder {
self
}
+ /// Sets file offset for this row group.
pub fn set_file_offset(mut self, value: i64) -> Self {
self.0.file_offset = Some(value);
self
@@ -1409,6 +1411,7 @@ impl Default for ColumnIndexBuilder {
}
impl ColumnIndexBuilder {
+ /// Creates a new column index builder.
pub fn new() -> Self {
ColumnIndexBuilder {
null_pages: Vec::new(),
@@ -1458,6 +1461,7 @@ impl ColumnIndexBuilder {
}
}
+ /// Set the boundary order of the column index
pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
self.boundary_order = boundary_order;
}
@@ -1506,6 +1510,7 @@ impl Default for OffsetIndexBuilder {
}
impl OffsetIndexBuilder {
+ /// Creates a new offset index builder.
pub fn new() -> Self {
OffsetIndexBuilder {
offset_array: Vec::new(),
@@ -1516,17 +1521,20 @@ impl OffsetIndexBuilder {
}
}
+ /// Append the row count of the next page.
pub fn append_row_count(&mut self, row_count: i64) {
let current_page_row_index = self.current_first_row_index;
self.first_row_index_array.push(current_page_row_index);
self.current_first_row_index += row_count;
}
+ /// Append the offset and size of the next page.
pub fn append_offset_and_size(&mut self, offset: i64,
compressed_page_size: i32) {
self.offset_array.push(offset);
self.compressed_page_size_array.push(compressed_page_size);
}
+ /// Append the unencoded byte array data bytes of the next page.
pub fn append_unencoded_byte_array_data_bytes(
&mut self,
unencoded_byte_array_data_bytes: Option<i64>,
diff --git a/parquet/src/file/metadata/writer.rs
b/parquet/src/file/metadata/writer.rs
index 44328c635..69a939e00 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -286,6 +286,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
Self { buf, metadata }
}
+ /// Write the metadata to the buffer
pub fn finish(mut self) -> Result<()> {
let file_metadata = self.metadata.file_metadata();
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index 2f30abead..a66509e14 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -50,18 +50,31 @@ pub struct PageIndex<T> {
}
impl<T> PageIndex<T> {
+ /// Returns the minimum value in the page
+ ///
+ /// It is `None` when all values are null
pub fn min(&self) -> Option<&T> {
self.min.as_ref()
}
+
+ /// Returns the maximum value in the page
+ ///
+ /// It is `None` when all values are null
pub fn max(&self) -> Option<&T> {
self.max.as_ref()
}
+
+ /// Returns the number of null values in the page
pub fn null_count(&self) -> Option<i64> {
self.null_count
}
+
+ /// Returns the repetition level histogram for the page
pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
self.repetition_level_histogram.as_ref()
}
+
+ /// Returns the definition level histogram for the page
pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
self.definition_level_histogram.as_ref()
}
@@ -71,10 +84,16 @@ impl<T> PageIndex<T>
where
T: AsBytes,
{
+ /// Returns the maximum value in the page as bytes
+ ///
+ /// It is `None` when all values are null
pub fn max_bytes(&self) -> Option<&[u8]> {
self.max.as_ref().map(|x| x.as_bytes())
}
+ /// Returns the minimum value in the page as bytes
+ ///
+ /// It is `None` when all values are null
pub fn min_bytes(&self) -> Option<&[u8]> {
self.min.as_ref().map(|x| x.as_bytes())
}
@@ -90,13 +109,21 @@ pub enum Index {
/// will only return pageLocations without min_max index,
/// `NONE` represents this lack of index information
NONE,
+ /// Boolean type index
BOOLEAN(NativeIndex<bool>),
+ /// 32-bit integer type index
INT32(NativeIndex<i32>),
+ /// 64-bit integer type index
INT64(NativeIndex<i64>),
+ /// 96-bit integer type (timestamp) index
INT96(NativeIndex<Int96>),
+ /// 32-bit floating point type index
FLOAT(NativeIndex<f32>),
+ /// 64-bit floating point type index
DOUBLE(NativeIndex<f64>),
+ /// Byte array type index
BYTE_ARRAY(NativeIndex<ByteArray>),
+ /// Fixed length byte array type index
FIXED_LEN_BYTE_ARRAY(NativeIndex<FixedLenByteArray>),
}
@@ -155,6 +182,7 @@ pub struct NativeIndex<T: ParquetValueType> {
}
impl<T: ParquetValueType> NativeIndex<T> {
+ /// The physical data type of the column
pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE;
/// Creates a new [`NativeIndex`]
diff --git a/parquet/src/file/page_index/offset_index.rs
b/parquet/src/file/page_index/offset_index.rs
index 2ae346414..d48d1b6c0 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -24,7 +24,10 @@ use crate::format::{OffsetIndex, PageLocation};
/// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY
columns.
#[derive(Debug, Clone, PartialEq)]
pub struct OffsetIndexMetaData {
+ /// Vector of [`PageLocation`] objects, one per page in the chunk.
pub page_locations: Vec<PageLocation>,
+ /// Optional vector of unencoded page sizes, one per page in the chunk.
+ /// Only defined for BYTE_ARRAY columns.
pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
}
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 61f6390c9..efcb63258 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -64,7 +64,9 @@ pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> =
None;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
+ /// Parquet format version 1.0
PARQUET_1_0,
+ /// Parquet format version 2.0
PARQUET_2_0,
}
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index d8a61fafe..400441f0c 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -61,7 +61,7 @@ pub trait Length {
/// User provided implementations can implement more sophisticated behaviors
/// such as on-demand buffering or scan sharing.
pub trait ChunkReader: Length + Send + Sync {
- /// The concrete type of readers returned by this trait
+ /// The concrete type of reader returned by this trait
type T: Read;
/// Get a [`Read`] instance starting at the provided file offset
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 50ed06436..2e05b8336 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -81,9 +81,10 @@ pub(crate) mod private {
gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
}
-// Macro to generate methods create Statistics.
+/// Macro to generate methods to create Statistics.
macro_rules! statistics_new_func {
($func:ident, $vtype:ty, $stat:ident) => {
+ #[doc = concat!("Creates new statistics for `", stringify!($stat), "`
column type.")]
pub fn $func(
min: $vtype,
max: $vtype,
@@ -244,7 +245,7 @@ pub fn from_thrift(
})
}
-// Convert Statistics into Thrift definition.
+/// Convert Statistics into Thrift definition.
pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
let stats = stats?;
@@ -306,13 +307,21 @@ pub fn to_thrift(stats: Option<&Statistics>) ->
Option<TStatistics> {
/// [NativeIndex]: crate::file::page_index::index::NativeIndex
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
+ /// Statistics for Boolean column
Boolean(ValueStatistics<bool>),
+ /// Statistics for Int32 column
Int32(ValueStatistics<i32>),
+ /// Statistics for Int64 column
Int64(ValueStatistics<i64>),
+ /// Statistics for Int96 column
Int96(ValueStatistics<Int96>),
+ /// Statistics for Float column
Float(ValueStatistics<f32>),
+ /// Statistics for Double column
Double(ValueStatistics<f64>),
+ /// Statistics for ByteArray column
ByteArray(ValueStatistics<ByteArray>),
+ /// Statistics for FixedLenByteArray column
FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
@@ -323,6 +332,7 @@ impl<T: ParquetValueType> From<ValueStatistics<T>> for
Statistics {
}
impl Statistics {
+ /// Creates new statistics for a column type
pub fn new<T: ParquetValueType>(
min: Option<T>,
max: Option<T>,
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 7b7bfa19c..afbe1e549 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -322,6 +322,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
}
}
+ /// Add a [`KeyValue`] to the file writer's metadata
pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue) {
self.kv_metadatas.push(kv_metadata);
}
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index a54d4a427..3b63845e7 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -82,6 +82,7 @@
//! [Logical Types]:
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
//! [object_store]: https://docs.rs/object_store/latest/object_store/
+#![warn(missing_docs)]
/// Defines a an item with an experimental public API
///
/// The module will not be documented, and will only be public if the
@@ -117,7 +118,7 @@ pub mod basic;
/// [parquet.thrift]:
https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
// see parquet/CONTRIBUTING.md for instructions on regenerating
// Don't try clippy and format auto generated code
-#[allow(clippy::all)]
+#[allow(clippy::all, missing_docs)]
#[rustfmt::skip]
pub mod format;
diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs
index 85d96fd65..7a2e268b3 100644
--- a/parquet/src/record/api.rs
+++ b/parquet/src/record/api.rs
@@ -101,6 +101,7 @@ impl Row {
}
}
+ /// Converts the row into a JSON object.
#[cfg(any(feature = "json", test))]
pub fn to_json_value(&self) -> Value {
Value::Object(
@@ -134,25 +135,45 @@ impl<'a> Iterator for RowColumnIter<'a> {
/// Trait for type-safe convenient access to fields within a Row.
pub trait RowAccessor {
+ /// Try to get a boolean value at the given index.
fn get_bool(&self, i: usize) -> Result<bool>;
+ /// Try to get a byte value at the given index.
fn get_byte(&self, i: usize) -> Result<i8>;
+ /// Try to get a short value at the given index.
fn get_short(&self, i: usize) -> Result<i16>;
+ /// Try to get an int value at the given index.
fn get_int(&self, i: usize) -> Result<i32>;
+ /// Try to get a long value at the given index.
fn get_long(&self, i: usize) -> Result<i64>;
+ /// Try to get a ubyte value at the given index.
fn get_ubyte(&self, i: usize) -> Result<u8>;
+ /// Try to get a ushort value at the given index.
fn get_ushort(&self, i: usize) -> Result<u16>;
+ /// Try to get a uint value at the given index.
fn get_uint(&self, i: usize) -> Result<u32>;
+ /// Try to get a ulong value at the given index.
fn get_ulong(&self, i: usize) -> Result<u64>;
+ /// Try to get a float16 value at the given index.
fn get_float16(&self, i: usize) -> Result<f16>;
+ /// Try to get a float value at the given index.
fn get_float(&self, i: usize) -> Result<f32>;
+ /// Try to get a double value at the given index.
fn get_double(&self, i: usize) -> Result<f64>;
+ /// Try to get a timestamp value, in milliseconds, at the given index.
fn get_timestamp_millis(&self, i: usize) -> Result<i64>;
+ /// Try to get a timestamp value, in microseconds, at the given index.
fn get_timestamp_micros(&self, i: usize) -> Result<i64>;
+ /// Try to get a decimal value at the given index.
fn get_decimal(&self, i: usize) -> Result<&Decimal>;
+ /// Try to get a string value at the given index.
fn get_string(&self, i: usize) -> Result<&String>;
+ /// Try to get a bytes value at the given index.
fn get_bytes(&self, i: usize) -> Result<&ByteArray>;
+ /// Try to get a group value at the given index.
fn get_group(&self, i: usize) -> Result<&Row>;
+ /// Try to get a list value at the given index.
fn get_list(&self, i: usize) -> Result<&List>;
+ /// Try to get a map value at the given index.
fn get_map(&self, i: usize) -> Result<&Map>;
}
@@ -175,6 +196,7 @@ pub trait RowAccessor {
/// ```
///
pub trait RowFormatter {
+ /// The method to format a field at the given index.
fn fmt(&self, i: usize) -> &dyn fmt::Display;
}
@@ -295,6 +317,7 @@ impl List {
self.elements.len()
}
+ /// Get the reference to the elements in this list
pub fn elements(&self) -> &[Field] {
self.elements.as_slice()
}
@@ -309,25 +332,47 @@ pub fn make_list(elements: Vec<Field>) -> List {
/// Trait for type-safe access of an index for a `List`.
/// Note that the get_XXX methods do not do bound checking.
pub trait ListAccessor {
+ /// Try getting a `boolean` value at the given index.
fn get_bool(&self, i: usize) -> Result<bool>;
+ /// Try getting a `byte` value at the given index.
fn get_byte(&self, i: usize) -> Result<i8>;
+ /// Try getting an `i16` value at the given index.
fn get_short(&self, i: usize) -> Result<i16>;
+ /// Try getting an `i32` value at the given index.
fn get_int(&self, i: usize) -> Result<i32>;
+ /// Try getting an `i64` value at the given index.
fn get_long(&self, i: usize) -> Result<i64>;
+ /// Try getting a `u8` value at the given index.
fn get_ubyte(&self, i: usize) -> Result<u8>;
+ /// Try getting a `u16` value at the given index.
fn get_ushort(&self, i: usize) -> Result<u16>;
+ /// Try getting a `u32` value at the given index.
fn get_uint(&self, i: usize) -> Result<u32>;
+ /// Try getting a `u64` value at the given index.
fn get_ulong(&self, i: usize) -> Result<u64>;
+ /// Try getting a `f16` value at the given index.
fn get_float16(&self, i: usize) -> Result<f16>;
+ /// Try getting a `f32` value at the given index.
fn get_float(&self, i: usize) -> Result<f32>;
+ /// Try getting a `f64` value at the given index.
fn get_double(&self, i: usize) -> Result<f64>;
+ /// Try getting a `timestamp` as milliseconds value
+ /// encoded as `i64` at the given index.
fn get_timestamp_millis(&self, i: usize) -> Result<i64>;
+ /// Try getting a `timestamp` as microseconds value
+ /// encoded as `i64` at the given index.
fn get_timestamp_micros(&self, i: usize) -> Result<i64>;
+ /// Try getting a `decimal` value at the given index.
fn get_decimal(&self, i: usize) -> Result<&Decimal>;
+ /// Try getting a `string` value at the given index.
fn get_string(&self, i: usize) -> Result<&String>;
+ /// Try getting a `bytes` value at the given index.
fn get_bytes(&self, i: usize) -> Result<&ByteArray>;
+ /// Try getting a `group` value at the given index.
fn get_group(&self, i: usize) -> Result<&Row>;
+ /// Try getting a `list` value at the given index.
fn get_list(&self, i: usize) -> Result<&List>;
+ /// Try getting a `map` value at the given index.
fn get_map(&self, i: usize) -> Result<&Map>;
}
@@ -420,6 +465,7 @@ impl Map {
self.entries.len()
}
+ /// Get the reference to the key-value pairs in this map
pub fn entries(&self) -> &[(Field, Field)] {
self.entries.as_slice()
}
@@ -433,7 +479,9 @@ pub fn make_map(entries: Vec<(Field, Field)>) -> Map {
/// Trait for type-safe access of an index for a `Map`
pub trait MapAccessor {
+ /// Get the keys of the map.
fn get_keys<'a>(&'a self) -> Box<dyn ListAccessor + 'a>;
+ /// Get the values of the map.
fn get_values<'a>(&'a self) -> Box<dyn ListAccessor + 'a>;
}
@@ -532,13 +580,13 @@ pub enum Field {
Int(i32),
/// Signed integer INT_64.
Long(i64),
- // Unsigned integer UINT_8.
+ /// Unsigned integer UINT_8.
UByte(u8),
- // Unsigned integer UINT_16.
+ /// Unsigned integer UINT_16.
UShort(u16),
- // Unsigned integer UINT_32.
+ /// Unsigned integer UINT_32.
UInt(u32),
- // Unsigned integer UINT_64.
+ /// Unsigned integer UINT_64.
ULong(u64),
/// IEEE 16-bit floating point value.
Float16(f16),
@@ -717,6 +765,7 @@ impl Field {
Ok(field)
}
+ /// Converts the Parquet field into a JSON [`Value`].
#[cfg(any(feature = "json", test))]
pub fn to_json_value(&self) -> Value {
use base64::prelude::BASE64_STANDARD;
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index cc29658d9..57469ee9c 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -304,18 +304,18 @@ impl TreeBuilder {
/// Reader tree for record assembly
pub enum Reader {
- // Primitive reader with type information and triplet iterator
+ /// Primitive reader with type information and triplet iterator
PrimitiveReader(TypePtr, Box<TripletIter>),
- // Optional reader with definition level of a parent and a reader
+ /// Optional reader with definition level of a parent and a reader
OptionReader(i16, Box<Reader>),
- // Group (struct) reader with type information, definition level and list
of child
- // readers. When it represents message type, type information is None
+ /// Group (struct) reader with type information, definition level and list
of child
+ /// readers. When it represents message type, type information is None
GroupReader(Option<TypePtr>, i16, Vec<Reader>),
- // Reader for repeated values, e.g. lists, contains type information,
definition
- // level, repetition level and a child reader
+ /// Reader for repeated values, e.g. lists, contains type information,
definition
+ /// level, repetition level and a child reader
RepeatedReader(TypePtr, i16, i16, Box<Reader>),
- // Reader of key-value pairs, e.g. maps, contains type information,
definition
- // level, repetition level, child reader for keys and child reader for
values
+ /// Reader of key-value pairs, e.g. maps, contains type information,
definition
+ /// level, repetition level, child reader for keys and child reader for
values
KeyValueReader(TypePtr, i16, i16, Box<Reader>, Box<Reader>),
}
diff --git a/parquet/src/record/record_reader.rs
b/parquet/src/record/record_reader.rs
index cfaf14a3d..75ca4e3e3 100644
--- a/parquet/src/record/record_reader.rs
+++ b/parquet/src/record/record_reader.rs
@@ -18,11 +18,12 @@
use super::super::errors::ParquetError;
use super::super::file::reader::RowGroupReader;
-/// Read up to `max_records` records from `row_group_reader` into `self`.
+/// Read up to `num_records` records from `row_group_reader` into `self`.
///
/// The type parameter `T` is used to work around the rust orphan rule
/// when implementing on types such as `Vec<T>`.
pub trait RecordReader<T> {
+ /// Read up to `num_records` records from `row_group_reader` into `self`.
fn read_from_row_group(
&mut self,
row_group_reader: &mut dyn RowGroupReader,
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 2665f28fe..39d2fa28c 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -45,15 +45,24 @@ pub type ColumnDescPtr = Arc<ColumnDescriptor>;
/// repetition is `None`.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
+ /// Represents a primitive leaf field.
PrimitiveType {
+ /// Basic information about the type.
basic_info: BasicTypeInfo,
+ /// Physical type of this primitive type.
physical_type: PhysicalType,
+ /// Length of this type.
type_length: i32,
+ /// Scale of this type.
scale: i32,
+ /// Precision of this type.
precision: i32,
},
+ /// Represents a group of fields (similar to struct).
GroupType {
+ /// Basic information about the type.
basic_info: BasicTypeInfo,
+ /// Fields of this group type.
fields: Vec<TypePtr>,
},
}
@@ -745,6 +754,7 @@ impl ColumnPath {
self.parts.append(&mut tail);
}
+ /// Returns a slice of path components.
pub fn parts(&self) -> &[String] {
&self.parts
}
@@ -1033,6 +1043,7 @@ impl SchemaDescriptor {
self.schema.as_ref()
}
+ /// Returns schema as [`TypePtr`] for cheap cloning.
pub fn root_schema_ptr(&self) -> TypePtr {
self.schema.clone()
}
diff --git a/parquet/src/schema/visitor.rs b/parquet/src/schema/visitor.rs
index 35fde11f1..7a10d3a5f 100644
--- a/parquet/src/schema/visitor.rs
+++ b/parquet/src/schema/visitor.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+//! Utilities for traversing the various Parquet types.
+
use crate::basic::{ConvertedType, Repetition};
use crate::errors::ParquetError::General;
use crate::errors::Result;
diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs
index abb2ac13c..5be025f95 100644
--- a/parquet/src/thrift.rs
+++ b/parquet/src/thrift.rs
@@ -27,7 +27,9 @@ use thrift::protocol::{
///
/// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of
trait objects
pub trait TSerializable: Sized {
+ /// Reads the struct from the input Thrift protocol
fn read_from_in_protocol<T: TInputProtocol>(i_prot: &mut T) ->
thrift::Result<Self>;
+ /// Writes the struct to the output Thrift protocol
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()>;
}