(arrow-rs) branch master updated: chore: add docs, part of #37 (#6496)

alamb Wed, 02 Oct 2024 03:10:41 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 5595019ba chore: add docs, part of #37 (#6496)
5595019ba is described below

commit 5595019bad5f7fa8c75106d31bf86c8c99d2822c
Author: ByteBaker <[email protected]>
AuthorDate: Wed Oct 2 15:39:51 2024 +0530

    chore: add docs, part of #37 (#6496)
    
    - add pragma `#![warn(missing_docs)]` to `parquet`
    
    This is the final component in the effort to make Arrow
    fully-documented. The entire project now generates warning
    for missing docs, if any.
    
    - `arrow-flight`: replace `tonic`'s deprecated `compile_with_config`
    with suggested method
    
    - new deprecation:
    The following types were not used anywhere and were possibly strays.
    They've been marked as deprecated and will be removed in future
    versions.
    
    - `parquet::data_types::SliceAsBytesDataType`
    - `parquet::column::writer::Level`
---
 arrow-flight/gen/src/main.rs                |  6 +--
 parquet/src/arrow/async_reader/metadata.rs  |  1 +
 parquet/src/basic.rs                        | 47 +++++++++++++++++++++++-
 parquet/src/column/page.rs                  | 28 ++++++++++++++
 parquet/src/column/reader.rs                |  8 ++++
 parquet/src/column/writer/mod.rs            | 15 ++++++++
 parquet/src/data_type.rs                    | 22 +++++++++++
 parquet/src/errors.rs                       |  2 +
 parquet/src/file/footer.rs                  |  2 +
 parquet/src/file/metadata/mod.rs            |  8 ++++
 parquet/src/file/metadata/writer.rs         |  1 +
 parquet/src/file/page_index/index.rs        | 28 ++++++++++++++
 parquet/src/file/page_index/offset_index.rs |  3 ++
 parquet/src/file/properties.rs              |  2 +
 parquet/src/file/reader.rs                  |  2 +-
 parquet/src/file/statistics.rs              | 14 ++++++-
 parquet/src/file/writer.rs                  |  1 +
 parquet/src/lib.rs                          |  3 +-
 parquet/src/record/api.rs                   | 57 +++++++++++++++++++++++++++--
 parquet/src/record/reader.rs                | 16 ++++----
 parquet/src/record/record_reader.rs         |  3 +-
 parquet/src/schema/types.rs                 | 11 ++++++
 parquet/src/schema/visitor.rs               |  2 +
 parquet/src/thrift.rs                       |  2 +
 24 files changed, 262 insertions(+), 22 deletions(-)

diff --git a/arrow-flight/gen/src/main.rs b/arrow-flight/gen/src/main.rs
index c4cb9dfec..a69134e7a 100644
--- a/arrow-flight/gen/src/main.rs
+++ b/arrow-flight/gen/src/main.rs
@@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let proto_path = Path::new("../format/Flight.proto");
 
     tonic_build::configure()
-        // protoc in unbuntu builder needs this option
+        // protoc in Ubuntu builder needs this option
         .protoc_arg("--experimental_allow_proto3_optional")
         .out_dir("src")
         .compile_protos_with_config(prost_config(), &[proto_path], 
&[proto_dir])?;
@@ -37,7 +37,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         .open("src/arrow.flight.protocol.rs")?;
     let mut buffer = String::new();
     file.read_to_string(&mut buffer)?;
-    // append warning that file was auto-generate
+    // append warning that file was auto-generated
     let mut file = OpenOptions::new()
         .write(true)
         .truncate(true)
@@ -49,7 +49,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let proto_path = Path::new("../format/FlightSql.proto");
 
     tonic_build::configure()
-        // protoc in ubuntu builder needs this option
+        // protoc in Ubuntu builder needs this option
         .protoc_arg("--experimental_allow_proto3_optional")
         .out_dir("src/sql")
         .compile_protos_with_config(prost_config(), &[proto_path], 
&[proto_dir])?;
diff --git a/parquet/src/arrow/async_reader/metadata.rs 
b/parquet/src/arrow/async_reader/metadata.rs
index b7fac6fe7..b2c6159be 100644
--- a/parquet/src/arrow/async_reader/metadata.rs
+++ b/parquet/src/arrow/async_reader/metadata.rs
@@ -29,6 +29,7 @@ use std::ops::Range;
 
 /// A data source that can be used with [`MetadataLoader`] to load 
[`ParquetMetaData`]
 pub trait MetadataFetch {
+    /// Fetches a range of bytes asynchronously
     fn fetch(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>>;
 }
 
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 8fde542f5..1926b8762 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -47,13 +47,21 @@ pub use crate::format::{
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 #[allow(non_camel_case_types)]
 pub enum Type {
+    /// A boolean value.
     BOOLEAN,
+    /// 32-bit signed integer.
     INT32,
+    /// 64-bit signed integer.
     INT64,
+    /// 96-bit signed integer for timestamps.
     INT96,
+    /// IEEE 754 single-precision floating point value.
     FLOAT,
+    /// IEEE 754 double-precision floating point value.
     DOUBLE,
+    /// Arbitrary length byte array.
     BYTE_ARRAY,
+    /// Fixed length byte array.
     FIXED_LEN_BYTE_ARRAY,
 }
 
@@ -70,6 +78,7 @@ pub enum Type {
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[allow(non_camel_case_types)]
 pub enum ConvertedType {
+    /// No type conversion.
     NONE,
     /// A BYTE_ARRAY actually contains UTF8 encoded chars.
     UTF8,
@@ -171,31 +180,53 @@ pub enum ConvertedType {
 /// [`ConvertedType`]. Please see the README.md for more details.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LogicalType {
+    /// A UTF8 encoded string.
     String,
+    /// A map of key-value pairs.
     Map,
+    /// A list of elements.
     List,
+    /// A set of predefined values.
     Enum,
+    /// A decimal value with a specified scale and precision.
     Decimal {
+        /// The number of digits to the right of the decimal point.
         scale: i32,
+        /// The total number of digits in the decimal.
         precision: i32,
     },
+    /// A date stored as days since Unix epoch.
     Date,
+    /// A time stored as [`TimeUnit`] since midnight.
     Time {
+        /// Whether the time is adjusted to UTC.
         is_adjusted_to_u_t_c: bool,
+        /// The unit of time.
         unit: TimeUnit,
     },
+    /// A timestamp stored as [`TimeUnit`] since Unix epoch.
     Timestamp {
+        /// Whether the timestamp is adjusted to UTC.
         is_adjusted_to_u_t_c: bool,
+        /// The unit of time.
         unit: TimeUnit,
     },
+    /// An integer with a specified bit width and signedness.
     Integer {
+        /// The number of bits in the integer.
         bit_width: i8,
+        /// Whether the integer is signed.
         is_signed: bool,
     },
+    /// An unknown logical type.
     Unknown,
+    /// A JSON document.
     Json,
+    /// A BSON document.
     Bson,
+    /// A UUID.
     Uuid,
+    /// A 16-bit floating point number.
     Float16,
 }
 
@@ -350,13 +381,21 @@ impl FromStr for Encoding {
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[allow(non_camel_case_types)]
 pub enum Compression {
+    /// No compression.
     UNCOMPRESSED,
+    /// [Snappy 
compression](https://en.wikipedia.org/wiki/Snappy_(compression))
     SNAPPY,
+    /// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt)
     GZIP(GzipLevel),
+    /// [LZO 
compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer)
     LZO,
+    /// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932)
     BROTLI(BrotliLevel),
+    /// [LZ4 compression](https://lz4.org/), 
[(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032)
     LZ4,
+    /// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878)
     ZSTD(ZstdLevel),
+    /// [LZ4 compression](https://lz4.org/) using the raw LZ4 block format (replaces the deprecated `LZ4` codec).
     LZ4_RAW,
 }
 
@@ -447,16 +486,20 @@ impl FromStr for Compression {
 }
 
 // ----------------------------------------------------------------------
-// Mirrors `parquet::PageType`
-
+/// Mirrors [parquet::PageType]
+///
 /// Available data pages for Parquet file format.
 /// Note that some of the page types may not be supported.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[allow(non_camel_case_types)]
 pub enum PageType {
+    /// Data page Parquet 1.0
     DATA_PAGE,
+    /// Index page
     INDEX_PAGE,
+    /// Dictionary page
     DICTIONARY_PAGE,
+    /// Data page Parquet 2.0
     DATA_PAGE_V2,
 }
 
diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs
index e3931dfe9..5c866318e 100644
--- a/parquet/src/column/page.rs
+++ b/parquet/src/column/page.rs
@@ -31,29 +31,51 @@ use crate::format::PageHeader;
 /// used to store uncompressed bytes of the page.
 #[derive(Clone)]
 pub enum Page {
+    /// Data page Parquet format v1.
     DataPage {
+        /// The underlying data buffer
         buf: Bytes,
+        /// Number of values in this page
         num_values: u32,
+        /// Encoding for values in this page
         encoding: Encoding,
+        /// Definition level encoding
         def_level_encoding: Encoding,
+        /// Repetition level encoding
         rep_level_encoding: Encoding,
+        /// Optional statistics for this page
         statistics: Option<Statistics>,
     },
+    /// Data page Parquet format v2.
     DataPageV2 {
+        /// The underlying data buffer
         buf: Bytes,
+        /// Number of values in this page
         num_values: u32,
+        /// Encoding for values in this page
         encoding: Encoding,
+        /// Number of null values in this page
         num_nulls: u32,
+        /// Number of rows in this page
         num_rows: u32,
+        /// Length of definition levels
         def_levels_byte_len: u32,
+        /// Length of repetition levels
         rep_levels_byte_len: u32,
+        /// Is this page compressed
         is_compressed: bool,
+        /// Optional statistics for this page
         statistics: Option<Statistics>,
     },
+    /// Dictionary page.
     DictionaryPage {
+        /// The underlying data buffer
         buf: Bytes,
+        /// Number of values in this page
         num_values: u32,
+        /// Encoding for values in this page
         encoding: Encoding,
+        /// Is dictionary page sorted
         is_sorted: bool,
     },
 }
@@ -235,11 +257,17 @@ impl CompressedPage {
 
 /// Contains page write metrics.
 pub struct PageWriteSpec {
+    /// The type of page being written
     pub page_type: PageType,
+    /// The total size of the page, before compression
     pub uncompressed_size: usize,
+    /// The compressed size of the page
     pub compressed_size: usize,
+    /// The number of values in the page
     pub num_values: u32,
+    /// The offset of the page in the column chunk
     pub offset: u64,
+    /// The number of bytes written to the underlying sink
     pub bytes_written: u64,
 }
 
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs
index 0c7cbb412..2b43b4c3e 100644
--- a/parquet/src/column/reader.rs
+++ b/parquet/src/column/reader.rs
@@ -34,13 +34,21 @@ pub(crate) mod decoder;
 
 /// Column reader for a Parquet type.
 pub enum ColumnReader {
+    /// Column reader for boolean type
     BoolColumnReader(ColumnReaderImpl<BoolType>),
+    /// Column reader for int32 type
     Int32ColumnReader(ColumnReaderImpl<Int32Type>),
+    /// Column reader for int64 type
     Int64ColumnReader(ColumnReaderImpl<Int64Type>),
+    /// Column reader for int96 type
     Int96ColumnReader(ColumnReaderImpl<Int96Type>),
+    /// Column reader for float type
     FloatColumnReader(ColumnReaderImpl<FloatType>),
+    /// Column reader for double type
     DoubleColumnReader(ColumnReaderImpl<DoubleType>),
+    /// Column reader for byte array type
     ByteArrayColumnReader(ColumnReaderImpl<ByteArrayType>),
+    /// Column reader for fixed length byte array type
     FixedLenByteArrayColumnReader(ColumnReaderImpl<FixedLenByteArrayType>),
 }
 
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index e0d3abed8..6071b68c6 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -61,13 +61,21 @@ macro_rules! downcast_writer {
 
 /// Column writer for a Parquet type.
 pub enum ColumnWriter<'a> {
+    /// Column writer for boolean type
     BoolColumnWriter(ColumnWriterImpl<'a, BoolType>),
+    /// Column writer for int32 type
     Int32ColumnWriter(ColumnWriterImpl<'a, Int32Type>),
+    /// Column writer for int64 type
     Int64ColumnWriter(ColumnWriterImpl<'a, Int64Type>),
+    /// Column writer for int96 (timestamp) type
     Int96ColumnWriter(ColumnWriterImpl<'a, Int96Type>),
+    /// Column writer for float type
     FloatColumnWriter(ColumnWriterImpl<'a, FloatType>),
+    /// Column writer for double type
     DoubleColumnWriter(ColumnWriterImpl<'a, DoubleType>),
+    /// Column writer for byte array type
     ByteArrayColumnWriter(ColumnWriterImpl<'a, ByteArrayType>),
+    /// Column writer for fixed length byte array type
     FixedLenByteArrayColumnWriter(ColumnWriterImpl<'a, FixedLenByteArrayType>),
 }
 
@@ -90,6 +98,11 @@ impl<'a> ColumnWriter<'a> {
     }
 }
 
+#[deprecated(
+    since = "54.0.0",
+    note = "Seems like a stray and nobody knows what's it for. Will be removed 
in the next release."
+)]
+#[allow(missing_docs)]
 pub enum Level {
     Page,
     Column,
@@ -309,6 +322,7 @@ impl<T: Default> ColumnMetrics<T> {
 /// Typed column writer for a primitive column.
 pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, 
ColumnValueEncoderImpl<T>>;
 
+/// Generic column writer for a primitive column.
 pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
     // Column writer properties
     descr: ColumnDescPtr,
@@ -344,6 +358,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
 }
 
 impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
+    /// Returns a new instance of [`GenericColumnWriter`].
     pub fn new(
         descr: ColumnDescPtr,
         props: WriterPropertiesPtr,
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index a3bcfd167..a3d0e3ce7 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -183,6 +183,7 @@ impl ByteArray {
         )
     }
 
+    /// Try to convert the byte array to a utf8 slice
     pub fn as_utf8(&self) -> Result<&str> {
         self.data
             .as_ref()
@@ -349,20 +350,29 @@ impl From<FixedLenByteArray> for ByteArray {
 pub enum Decimal {
     /// Decimal backed by `i32`.
     Int32 {
+        /// The underlying value
         value: [u8; 4],
+        /// The total number of digits in the number
         precision: i32,
+        /// The number of digits to the right of the decimal point
         scale: i32,
     },
     /// Decimal backed by `i64`.
     Int64 {
+        /// The underlying value
         value: [u8; 8],
+        /// The total number of digits in the number
         precision: i32,
+        /// The number of digits to the right of the decimal point
         scale: i32,
     },
     /// Decimal backed by byte array.
     Bytes {
+        /// The underlying value
         value: ByteArray,
+        /// The total number of digits in the number
         precision: i32,
+        /// The number of digits to the right of the decimal point
         scale: i32,
     },
 }
@@ -1120,6 +1130,7 @@ pub(crate) mod private {
 /// Contains the Parquet physical type information as well as the Rust 
primitive type
 /// presentation.
 pub trait DataType: 'static + Send {
+    /// The physical type of the Parquet data type.
     type T: private::ParquetValueType;
 
     /// Returns Parquet physical type.
@@ -1130,20 +1141,24 @@ pub trait DataType: 'static + Send {
     /// Returns size in bytes for Rust representation of the physical type.
     fn get_type_size() -> usize;
 
+    /// Returns the underlying [`ColumnReaderImpl`] for the given 
[`ColumnReader`].
     fn get_column_reader(column_writer: ColumnReader) -> 
Option<ColumnReaderImpl<Self>>
     where
         Self: Sized;
 
+    /// Returns the underlying [`ColumnWriterImpl`] for the given 
[`ColumnWriter`].
     fn get_column_writer(column_writer: ColumnWriter<'_>) -> 
Option<ColumnWriterImpl<'_, Self>>
     where
         Self: Sized;
 
+    /// Returns a reference to the underlying [`ColumnWriterImpl`] for the 
given [`ColumnWriter`].
     fn get_column_writer_ref<'a, 'b: 'a>(
         column_writer: &'b ColumnWriter<'a>,
     ) -> Option<&'b ColumnWriterImpl<'a, Self>>
     where
         Self: Sized;
 
+    /// Returns a mutable reference to the underlying [`ColumnWriterImpl`] for 
the given [`ColumnWriter`].
     fn get_column_writer_mut<'a, 'b: 'a>(
         column_writer: &'a mut ColumnWriter<'b>,
     ) -> Option<&'a mut ColumnWriterImpl<'b, Self>>
@@ -1152,12 +1167,18 @@ pub trait DataType: 'static + Send {
 }
 
 // Workaround bug in specialization
+#[deprecated(
+    since = "54.0.0",
+    note = "Seems like a stray and nobody knows what's it for. Will be removed 
in 55.0.0"
+)]
+#[allow(missing_docs)]
 pub trait SliceAsBytesDataType: DataType
 where
     Self::T: SliceAsBytes,
 {
 }
 
+#[allow(deprecated)]
 impl<T> SliceAsBytesDataType for T
 where
     T: DataType,
@@ -1167,6 +1188,7 @@ where
 
 macro_rules! make_type {
     ($name:ident, $reader_ident: ident, $writer_ident: ident, $native_ty:ty, 
$size:expr) => {
+        #[doc = concat!("Parquet physical type: ", stringify!($name))]
         #[derive(Clone)]
         pub struct $name {}
 
diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs
index a242c9768..bb4d2543c 100644
--- a/parquet/src/errors.rs
+++ b/parquet/src/errors.rs
@@ -42,6 +42,8 @@ pub enum ParquetError {
     /// Arrow error.
     /// Returned when reading into arrow or writing from arrow.
     ArrowError(String),
+    /// Error when the requested column index is more than the
+    /// number of columns in the row group
     IndexOutOfBound(usize, usize),
     /// An external error variant
     External(Box<dyn Error + Send + Sync>),
diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs
index 3dd698e3d..bd31c9142 100644
--- a/parquet/src/file/footer.rs
+++ b/parquet/src/file/footer.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Module for working with Parquet file footers.
+
 use crate::errors::Result;
 use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE};
 
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 30b17b6a2..5a2ccbc02 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -419,6 +419,7 @@ impl From<ParquetMetaData> for ParquetMetaDataBuilder {
     }
 }
 
+/// A key-value pair for [`FileMetaData`].
 pub type KeyValue = crate::format::KeyValue;
 
 /// Reference counted pointer for [`FileMetaData`].
@@ -722,6 +723,7 @@ impl RowGroupMetaDataBuilder {
         self
     }
 
+    /// Sets file offset for this row group.
     pub fn set_file_offset(mut self, value: i64) -> Self {
         self.0.file_offset = Some(value);
         self
@@ -1409,6 +1411,7 @@ impl Default for ColumnIndexBuilder {
 }
 
 impl ColumnIndexBuilder {
+    /// Creates a new column index builder.
     pub fn new() -> Self {
         ColumnIndexBuilder {
             null_pages: Vec::new(),
@@ -1458,6 +1461,7 @@ impl ColumnIndexBuilder {
         }
     }
 
+    /// Set the boundary order of the column index
     pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
         self.boundary_order = boundary_order;
     }
@@ -1506,6 +1510,7 @@ impl Default for OffsetIndexBuilder {
 }
 
 impl OffsetIndexBuilder {
+    /// Creates a new offset index builder.
     pub fn new() -> Self {
         OffsetIndexBuilder {
             offset_array: Vec::new(),
@@ -1516,17 +1521,20 @@ impl OffsetIndexBuilder {
         }
     }
 
+    /// Append the row count of the next page.
     pub fn append_row_count(&mut self, row_count: i64) {
         let current_page_row_index = self.current_first_row_index;
         self.first_row_index_array.push(current_page_row_index);
         self.current_first_row_index += row_count;
     }
 
+    /// Append the offset and size of the next page.
     pub fn append_offset_and_size(&mut self, offset: i64, 
compressed_page_size: i32) {
         self.offset_array.push(offset);
         self.compressed_page_size_array.push(compressed_page_size);
     }
 
+    /// Append the unencoded byte array data bytes of the next page.
     pub fn append_unencoded_byte_array_data_bytes(
         &mut self,
         unencoded_byte_array_data_bytes: Option<i64>,
diff --git a/parquet/src/file/metadata/writer.rs 
b/parquet/src/file/metadata/writer.rs
index 44328c635..69a939e00 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -286,6 +286,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
         Self { buf, metadata }
     }
 
+    /// Write the metadata to the buffer
     pub fn finish(mut self) -> Result<()> {
         let file_metadata = self.metadata.file_metadata();
 
diff --git a/parquet/src/file/page_index/index.rs 
b/parquet/src/file/page_index/index.rs
index 2f30abead..a66509e14 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -50,18 +50,31 @@ pub struct PageIndex<T> {
 }
 
 impl<T> PageIndex<T> {
+    /// Returns the minimum value in the page
+    ///
+    /// It is `None` when all values are null
     pub fn min(&self) -> Option<&T> {
         self.min.as_ref()
     }
+
+    /// Returns the maximum value in the page
+    ///
+    /// It is `None` when all values are null
     pub fn max(&self) -> Option<&T> {
         self.max.as_ref()
     }
+
+    /// Returns the number of null values in the page
     pub fn null_count(&self) -> Option<i64> {
         self.null_count
     }
+
+    /// Returns the repetition level histogram for the page
     pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
         self.repetition_level_histogram.as_ref()
     }
+
+    /// Returns the definition level histogram for the page
     pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
         self.definition_level_histogram.as_ref()
     }
@@ -71,10 +84,16 @@ impl<T> PageIndex<T>
 where
     T: AsBytes,
 {
+    /// Returns the maximum value in the page as bytes
+    ///
+    /// It is `None` when all values are null
     pub fn max_bytes(&self) -> Option<&[u8]> {
         self.max.as_ref().map(|x| x.as_bytes())
     }
 
+    /// Returns the minimum value in the page as bytes
+    ///
+    /// It is `None` when all values are null
     pub fn min_bytes(&self) -> Option<&[u8]> {
         self.min.as_ref().map(|x| x.as_bytes())
     }
@@ -90,13 +109,21 @@ pub enum Index {
     /// will only return pageLocations without min_max index,
     /// `NONE` represents this lack of index information
     NONE,
+    /// Boolean type index
     BOOLEAN(NativeIndex<bool>),
+    /// 32-bit integer type index
     INT32(NativeIndex<i32>),
+    /// 64-bit integer type index
     INT64(NativeIndex<i64>),
+    /// 96-bit integer type (timestamp) index
     INT96(NativeIndex<Int96>),
+    /// 32-bit floating point type index
     FLOAT(NativeIndex<f32>),
+    /// 64-bit floating point type index
     DOUBLE(NativeIndex<f64>),
+    /// Byte array type index
     BYTE_ARRAY(NativeIndex<ByteArray>),
+    /// Fixed length byte array type index
     FIXED_LEN_BYTE_ARRAY(NativeIndex<FixedLenByteArray>),
 }
 
@@ -155,6 +182,7 @@ pub struct NativeIndex<T: ParquetValueType> {
 }
 
 impl<T: ParquetValueType> NativeIndex<T> {
+    /// The physical data type of the column
     pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE;
 
     /// Creates a new [`NativeIndex`]
diff --git a/parquet/src/file/page_index/offset_index.rs 
b/parquet/src/file/page_index/offset_index.rs
index 2ae346414..d48d1b6c0 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -24,7 +24,10 @@ use crate::format::{OffsetIndex, PageLocation};
 /// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY 
columns.
 #[derive(Debug, Clone, PartialEq)]
 pub struct OffsetIndexMetaData {
+    /// Vector of [`PageLocation`] objects, one per page in the chunk.
     pub page_locations: Vec<PageLocation>,
+    /// Optional vector of unencoded page sizes, one per page in the chunk.
+    /// Only defined for BYTE_ARRAY columns.
     pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
 }
 
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 61f6390c9..efcb63258 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -64,7 +64,9 @@ pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = 
None;
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[allow(non_camel_case_types)]
 pub enum WriterVersion {
+    /// Parquet format version 1.0
     PARQUET_1_0,
+    /// Parquet format version 2.0
     PARQUET_2_0,
 }
 
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index d8a61fafe..400441f0c 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -61,7 +61,7 @@ pub trait Length {
 /// User provided implementations can implement more sophisticated behaviors
 /// such as on-demand buffering or scan sharing.
 pub trait ChunkReader: Length + Send + Sync {
-    /// The concrete type of readers returned by this trait
+    /// The concrete type of reader returned by this trait
     type T: Read;
 
     /// Get a [`Read`] instance starting at the provided file offset
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 50ed06436..2e05b8336 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -81,9 +81,10 @@ pub(crate) mod private {
     gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
 }
 
-// Macro to generate methods create Statistics.
+/// Macro to generate methods to create Statistics.
 macro_rules! statistics_new_func {
     ($func:ident, $vtype:ty, $stat:ident) => {
+        #[doc = concat!("Creates new statistics for `", stringify!($stat), "` 
column type.")]
         pub fn $func(
             min: $vtype,
             max: $vtype,
@@ -244,7 +245,7 @@ pub fn from_thrift(
     })
 }
 
-// Convert Statistics into Thrift definition.
+/// Convert Statistics into Thrift definition.
 pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
     let stats = stats?;
 
@@ -306,13 +307,21 @@ pub fn to_thrift(stats: Option<&Statistics>) -> 
Option<TStatistics> {
 /// [NativeIndex]: crate::file::page_index::index::NativeIndex
 #[derive(Debug, Clone, PartialEq)]
 pub enum Statistics {
+    /// Statistics for Boolean column
     Boolean(ValueStatistics<bool>),
+    /// Statistics for Int32 column
     Int32(ValueStatistics<i32>),
+    /// Statistics for Int64 column
     Int64(ValueStatistics<i64>),
+    /// Statistics for Int96 column
     Int96(ValueStatistics<Int96>),
+    /// Statistics for Float column
     Float(ValueStatistics<f32>),
+    /// Statistics for Double column
     Double(ValueStatistics<f64>),
+    /// Statistics for ByteArray column
     ByteArray(ValueStatistics<ByteArray>),
+    /// Statistics for FixedLenByteArray column
     FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
 }
 
@@ -323,6 +332,7 @@ impl<T: ParquetValueType> From<ValueStatistics<T>> for 
Statistics {
 }
 
 impl Statistics {
+    /// Creates new statistics for a column type
     pub fn new<T: ParquetValueType>(
         min: Option<T>,
         max: Option<T>,
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 7b7bfa19c..afbe1e549 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -322,6 +322,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
         }
     }
 
+    /// Add a [`KeyValue`] to the file writer's metadata
     pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue) {
         self.kv_metadatas.push(kv_metadata);
     }
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index a54d4a427..3b63845e7 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -82,6 +82,7 @@
 //! [Logical Types]: 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
 //! [object_store]: https://docs.rs/object_store/latest/object_store/
 
+#![warn(missing_docs)]
 /// Defines a an item with an experimental public API
 ///
 /// The module will not be documented, and will only be public if the
@@ -117,7 +118,7 @@ pub mod basic;
 /// [parquet.thrift]: 
https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
 // see parquet/CONTRIBUTING.md for instructions on regenerating
 // Don't try clippy and format auto generated code
-#[allow(clippy::all)]
+#[allow(clippy::all, missing_docs)]
 #[rustfmt::skip]
 pub mod format;
 
diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs
index 85d96fd65..7a2e268b3 100644
--- a/parquet/src/record/api.rs
+++ b/parquet/src/record/api.rs
@@ -101,6 +101,7 @@ impl Row {
         }
     }
 
+    /// Converts the row into a JSON object.
     #[cfg(any(feature = "json", test))]
     pub fn to_json_value(&self) -> Value {
         Value::Object(
@@ -134,25 +135,45 @@ impl<'a> Iterator for RowColumnIter<'a> {
 
 /// Trait for type-safe convenient access to fields within a Row.
 pub trait RowAccessor {
+    /// Try to get a boolean value at the given index.
     fn get_bool(&self, i: usize) -> Result<bool>;
+    /// Try to get a byte value at the given index.
     fn get_byte(&self, i: usize) -> Result<i8>;
+    /// Try to get a short value at the given index.
     fn get_short(&self, i: usize) -> Result<i16>;
+    /// Try to get an int value at the given index.
     fn get_int(&self, i: usize) -> Result<i32>;
+    /// Try to get a long value at the given index.
     fn get_long(&self, i: usize) -> Result<i64>;
+    /// Try to get a ubyte value at the given index.
     fn get_ubyte(&self, i: usize) -> Result<u8>;
+    /// Try to get a ushort value at the given index.
     fn get_ushort(&self, i: usize) -> Result<u16>;
+    /// Try to get a uint value at the given index.
     fn get_uint(&self, i: usize) -> Result<u32>;
+    /// Try to get a ulong value at the given index.
     fn get_ulong(&self, i: usize) -> Result<u64>;
+    /// Try to get a float16 value at the given index.
     fn get_float16(&self, i: usize) -> Result<f16>;
+    /// Try to get a float value at the given index.
     fn get_float(&self, i: usize) -> Result<f32>;
+    /// Try to get a double value at the given index.
     fn get_double(&self, i: usize) -> Result<f64>;
+    /// Try to get a timestamp value, in milliseconds, at the given index.
     fn get_timestamp_millis(&self, i: usize) -> Result<i64>;
+    /// Try to get a timestamp value, in microseconds, at the given index.
     fn get_timestamp_micros(&self, i: usize) -> Result<i64>;
+    /// Try to get a decimal value at the given index.
     fn get_decimal(&self, i: usize) -> Result<&Decimal>;
+    /// Try to get a string value at the given index.
     fn get_string(&self, i: usize) -> Result<&String>;
+    /// Try to get a bytes value at the given index.
     fn get_bytes(&self, i: usize) -> Result<&ByteArray>;
+    /// Try to get a group value at the given index.
     fn get_group(&self, i: usize) -> Result<&Row>;
+    /// Try to get a list value at the given index.
     fn get_list(&self, i: usize) -> Result<&List>;
+    /// Try to get a map value at the given index.
     fn get_map(&self, i: usize) -> Result<&Map>;
 }
 
@@ -175,6 +196,7 @@ pub trait RowAccessor {
 /// ```
 ///
 pub trait RowFormatter {
+    /// The method to format a field at the given index.
     fn fmt(&self, i: usize) -> &dyn fmt::Display;
 }
 
@@ -295,6 +317,7 @@ impl List {
         self.elements.len()
     }
 
+    /// Get the reference to the elements in this list
     pub fn elements(&self) -> &[Field] {
         self.elements.as_slice()
     }
@@ -309,25 +332,47 @@ pub fn make_list(elements: Vec<Field>) -> List {
 /// Trait for type-safe access of an index for a `List`.
 /// Note that the get_XXX methods do not do bound checking.
 pub trait ListAccessor {
+    /// Try getting a `boolean` value at the given index.
     fn get_bool(&self, i: usize) -> Result<bool>;
+    /// Try getting a `byte` value at the given index.
     fn get_byte(&self, i: usize) -> Result<i8>;
+    /// Try getting an `i16` value at the given index.
     fn get_short(&self, i: usize) -> Result<i16>;
+    /// Try getting an `i32` value at the given index.
     fn get_int(&self, i: usize) -> Result<i32>;
+    /// Try getting an `i64` value at the given index.
     fn get_long(&self, i: usize) -> Result<i64>;
+    /// Try getting a `u8` value at the given index.
     fn get_ubyte(&self, i: usize) -> Result<u8>;
+    /// Try getting a `u16` value at the given index.
     fn get_ushort(&self, i: usize) -> Result<u16>;
+    /// Try getting a `u32` value at the given index.
     fn get_uint(&self, i: usize) -> Result<u32>;
+    /// Try getting a `u64` value at the given index.
     fn get_ulong(&self, i: usize) -> Result<u64>;
+    /// Try getting a `f16` value at the given index.
     fn get_float16(&self, i: usize) -> Result<f16>;
+    /// Try getting a `f32` value at the given index.
     fn get_float(&self, i: usize) -> Result<f32>;
+    /// Try getting a `f64` value at the given index.
     fn get_double(&self, i: usize) -> Result<f64>;
+    /// Try getting a `timestamp` as milliseconds value
+    /// encoded as `i64` at the given index.
     fn get_timestamp_millis(&self, i: usize) -> Result<i64>;
+    /// Try getting a `timestamp` as microseconds value
+    /// encoded as `i64` at the given index.
     fn get_timestamp_micros(&self, i: usize) -> Result<i64>;
+    /// Try getting a `decimal` value at the given index.
     fn get_decimal(&self, i: usize) -> Result<&Decimal>;
+    /// Try getting a `string` value at the given index.
     fn get_string(&self, i: usize) -> Result<&String>;
+    /// Try getting a `bytes` value at the given index.
     fn get_bytes(&self, i: usize) -> Result<&ByteArray>;
+    /// Try getting a `group` value at the given index.
     fn get_group(&self, i: usize) -> Result<&Row>;
+    /// Try getting a `list` value at the given index.
     fn get_list(&self, i: usize) -> Result<&List>;
+    /// Try getting a `map` value at the given index.
     fn get_map(&self, i: usize) -> Result<&Map>;
 }
 
@@ -420,6 +465,7 @@ impl Map {
         self.entries.len()
     }
 
+    /// Get the reference to the key-value pairs in this map
     pub fn entries(&self) -> &[(Field, Field)] {
         self.entries.as_slice()
     }
@@ -433,7 +479,9 @@ pub fn make_map(entries: Vec<(Field, Field)>) -> Map {
 
 /// Trait for type-safe access of an index for a `Map`
 pub trait MapAccessor {
+    /// Get the keys of the map.
     fn get_keys<'a>(&'a self) -> Box<dyn ListAccessor + 'a>;
+    /// Get the values of the map.
     fn get_values<'a>(&'a self) -> Box<dyn ListAccessor + 'a>;
 }
 
@@ -532,13 +580,13 @@ pub enum Field {
     Int(i32),
     /// Signed integer INT_64.
     Long(i64),
-    // Unsigned integer UINT_8.
+    /// Unsigned integer UINT_8.
     UByte(u8),
-    // Unsigned integer UINT_16.
+    /// Unsigned integer UINT_16.
     UShort(u16),
-    // Unsigned integer UINT_32.
+    /// Unsigned integer UINT_32.
     UInt(u32),
-    // Unsigned integer UINT_64.
+    /// Unsigned integer UINT_64.
     ULong(u64),
     /// IEEE 16-bit floating point value.
     Float16(f16),
@@ -717,6 +765,7 @@ impl Field {
         Ok(field)
     }
 
+    /// Converts the Parquet field into a JSON [`Value`].
     #[cfg(any(feature = "json", test))]
     pub fn to_json_value(&self) -> Value {
         use base64::prelude::BASE64_STANDARD;
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index cc29658d9..57469ee9c 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -304,18 +304,18 @@ impl TreeBuilder {
 
 /// Reader tree for record assembly
 pub enum Reader {
-    // Primitive reader with type information and triplet iterator
+    /// Primitive reader with type information and triplet iterator
     PrimitiveReader(TypePtr, Box<TripletIter>),
-    // Optional reader with definition level of a parent and a reader
+    /// Optional reader with definition level of a parent and a reader
     OptionReader(i16, Box<Reader>),
-    // Group (struct) reader with type information, definition level and list of child
-    // readers. When it represents message type, type information is None
+    /// Group (struct) reader with type information, definition level and list of child
+    /// readers. When it represents message type, type information is None
     GroupReader(Option<TypePtr>, i16, Vec<Reader>),
-    // Reader for repeated values, e.g. lists, contains type information, definition
-    // level, repetition level and a child reader
+    /// Reader for repeated values, e.g. lists, contains type information, definition
+    /// level, repetition level and a child reader
     RepeatedReader(TypePtr, i16, i16, Box<Reader>),
-    // Reader of key-value pairs, e.g. maps, contains type information, definition
-    // level, repetition level, child reader for keys and child reader for values
+    /// Reader of key-value pairs, e.g. maps, contains type information, definition
+    /// level, repetition level, child reader for keys and child reader for values
     KeyValueReader(TypePtr, i16, i16, Box<Reader>, Box<Reader>),
 }
 
diff --git a/parquet/src/record/record_reader.rs b/parquet/src/record/record_reader.rs
index cfaf14a3d..75ca4e3e3 100644
--- a/parquet/src/record/record_reader.rs
+++ b/parquet/src/record/record_reader.rs
@@ -18,11 +18,12 @@
 use super::super::errors::ParquetError;
 use super::super::file::reader::RowGroupReader;
 
-/// Read up to `max_records` records from `row_group_reader` into `self`.
+/// Read up to `num_records` records from `row_group_reader` into `self`.
 ///
 /// The type parameter `T` is used to work around the rust orphan rule
 /// when implementing on types such as `Vec<T>`.
 pub trait RecordReader<T> {
+    /// Read up to `num_records` records from `row_group_reader` into `self`.
     fn read_from_row_group(
         &mut self,
         row_group_reader: &mut dyn RowGroupReader,
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 2665f28fe..39d2fa28c 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -45,15 +45,24 @@ pub type ColumnDescPtr = Arc<ColumnDescriptor>;
 /// repetition is `None`.
 #[derive(Clone, Debug, PartialEq)]
 pub enum Type {
+    /// Represents a primitive leaf field.
     PrimitiveType {
+        /// Basic information about the type.
         basic_info: BasicTypeInfo,
+        /// Physical type of this primitive type.
         physical_type: PhysicalType,
+        /// Length of this type.
         type_length: i32,
+        /// Scale of this type.
         scale: i32,
+        /// Precision of this type.
         precision: i32,
     },
+    /// Represents a group of fields (similar to struct).
     GroupType {
+        /// Basic information about the type.
         basic_info: BasicTypeInfo,
+        /// Fields of this group type.
         fields: Vec<TypePtr>,
     },
 }
@@ -745,6 +754,7 @@ impl ColumnPath {
         self.parts.append(&mut tail);
     }
 
+    /// Returns a slice of path components.
     pub fn parts(&self) -> &[String] {
         &self.parts
     }
@@ -1033,6 +1043,7 @@ impl SchemaDescriptor {
         self.schema.as_ref()
     }
 
+    /// Returns schema as [`TypePtr`] for cheap cloning.
     pub fn root_schema_ptr(&self) -> TypePtr {
         self.schema.clone()
     }
diff --git a/parquet/src/schema/visitor.rs b/parquet/src/schema/visitor.rs
index 35fde11f1..7a10d3a5f 100644
--- a/parquet/src/schema/visitor.rs
+++ b/parquet/src/schema/visitor.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Utilities to traverse the various Parquet types.
+
 use crate::basic::{ConvertedType, Repetition};
 use crate::errors::ParquetError::General;
 use crate::errors::Result;
diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs
index abb2ac13c..5be025f95 100644
--- a/parquet/src/thrift.rs
+++ b/parquet/src/thrift.rs
@@ -27,7 +27,9 @@ use thrift::protocol::{
 ///
 /// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of trait objects
 pub trait TSerializable: Sized {
+    /// Reads the struct from the input Thrift protocol
     fn read_from_in_protocol<T: TInputProtocol>(i_prot: &mut T) -> thrift::Result<Self>;
+    /// Writes the struct to the output Thrift protocol
     fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> thrift::Result<()>;
 }

(arrow-rs) branch master updated: chore: add docs, part of #37 (#6496)

Reply via email to