This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c6b3eaa9637 Refine parquet documentation on types and metadata (#5786)
c6b3eaa9637 is described below
commit c6b3eaa963742b4b490e1dce2c0843e9a1e8f80a
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon May 20 08:50:08 2024 -0400
Refine parquet documentation on types and metadata (#5786)
* Refine parquet documentation on types and metadata
* Update regen.sh and thrift.rs
* Clarify page index encompasses offset index and column index
* revert unexpected diff
---
parquet/regen.sh | 6 +++-
parquet/src/file/metadata.rs | 64 ++++++++++++++++++++++++------------
parquet/src/file/mod.rs | 11 ++++---
parquet/src/file/page_index/index.rs | 9 ++---
parquet/src/file/statistics.rs | 1 +
parquet/src/format.rs | 3 +-
parquet/src/lib.rs | 23 ++++++++-----
parquet/src/schema/mod.rs | 14 ++++++++
parquet/src/schema/types.rs | 52 ++++++++++++++++++-----------
9 files changed, 123 insertions(+), 60 deletions(-)
diff --git a/parquet/regen.sh b/parquet/regen.sh
index f2d8158765c..d1b82108a01 100755
--- a/parquet/regen.sh
+++ b/parquet/regen.sh
@@ -21,7 +21,10 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2
SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
-docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
+COMMENT='//! See [`crate::file`] for easier to use APIs.'
+
+# Note: add argument --platform=linux/amd64 to run on mac
+docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
pacman -Sy --noconfirm wget thrift && \
wget
https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift
-O /tmp/parquet.thrift && \
thrift --gen rs /tmp/parquet.thrift && \
@@ -35,5 +38,6 @@ docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c
"\
sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn
read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
echo 'Rewriting return value expectations' && \
sed -i 's/Ok(ret.expect(\"return value should have been
constructed\"))/ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
\"return value should have been constructed\")))/g' parquet.rs && \
+ sed -i '1i${COMMENT}' parquet.rs && \
mv parquet.rs /thrift/src/format.rs
"
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index c9232d83e80..853d5ffec8b 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -15,23 +15,20 @@
// specific language governing permissions and limitations
// under the License.
-//! Contains information about available Parquet metadata.
+//! Parquet metadata structures
//!
-//! The hierarchy of metadata is as follows:
+//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
+//! file footer.
//!
-//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains
-//! [`FileMetaData`](struct.FileMetaData.html) and zero or more
-//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group.
+//! * [`FileMetaData`]: File level metadata such as schema, row counts and
+//! version.
//!
-//! [`FileMetaData`](struct.FileMetaData.html) includes file version,
application specific
-//! metadata.
+//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
+//! location and number of rows, and column chunks.
//!
-//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains
information about row
-//! group and one or more
[`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for
-//! each column chunk.
-//!
-//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information
about column
-//! chunk (primitive leaf column), including encoding/compression, number of
values, etc.
+//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
+//! within a Row Group including encoding and compression information,
+//! number of values, statistics, etc.
use std::ops::Range;
use std::sync::Arc;
@@ -61,7 +58,7 @@ use crate::schema::types::{
/// column in the third row group of the parquet file.
pub type ParquetColumnIndex = Vec<Vec<Index>>;
-/// [`PageLocation`] for each datapage of each row group of each column.
+/// [`PageLocation`] for each data page of each row group of each column.
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -72,14 +69,30 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
-/// Global Parquet metadata.
+/// Global Parquet metadata, including [`FileMetaData`], [`RowGroupMetaData`].
+///
+/// This structure is stored in the footer of Parquet files, in the format
+/// defined by [`parquet.thrift`]. It contains:
+///
+/// * File level metadata: [`FileMetaData`]
+/// * Row Group level metadata: [`RowGroupMetaData`]
+/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and
[`ParquetOffsetIndex`]
+///
+/// [`parquet.thrift`]:
https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
+///
+/// This structure is read by the various readers in this crate or can be read
+/// directly from a file using the [`parse_metadata`] function.
+///
+/// [`parse_metadata`]: crate::file::footer::parse_metadata
#[derive(Debug, Clone)]
pub struct ParquetMetaData {
+ /// File level metadata
file_metadata: FileMetaData,
+ /// Row group metadata
row_groups: Vec<RowGroupMetaData>,
- /// Page index for all pages in each column chunk
+ /// Page level index for each page in each column chunk
column_index: Option<ParquetColumnIndex>,
- /// Offset index for all pages in each column chunk
+ /// Offset index for each page in each column chunk
offset_index: Option<ParquetOffsetIndex>,
}
@@ -172,7 +185,9 @@ pub type KeyValue = crate::format::KeyValue;
/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
-/// Metadata for a Parquet file.
+/// File level metadata for a Parquet file.
+///
+/// Includes the version of the file, metadata, number of rows, schema, and
column orders
#[derive(Debug, Clone)]
pub struct FileMetaData {
version: i32,
@@ -271,7 +286,10 @@ impl FileMetaData {
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
-/// Metadata for a row group.
+/// Metadata for a row group
+///
+/// Includes [`ColumnChunkMetaData`] for each column in the row group, the
number of rows,
+/// the total byte size of the row group, and the [`SchemaDescriptor`] for the
row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
columns: Vec<ColumnChunkMetaData>,
@@ -279,8 +297,9 @@ pub struct RowGroupMetaData {
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
- // We can't infer from file offset of first column since there may empty
columns in row group.
+ /// We can't infer from file offset of first column since there may be empty
columns in row group.
file_offset: Option<i64>,
+ /// Ordinal position of this row group in file
ordinal: Option<i16>,
}
@@ -335,7 +354,10 @@ impl RowGroupMetaData {
self.schema_descr.clone()
}
- /// Returns ordinal of this row group in file
+ /// Returns ordinal position of this row group in file.
+ ///
+ /// For example if this is the first row group in the file, this will
return 0.
+ /// If this is the second row group in the file, this will return 1.
#[inline(always)]
pub fn ordinal(&self) -> Option<i16> {
self.ordinal
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index 6589d2efaf8..a1df33633fc 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -19,10 +19,13 @@
//!
//! Provides access to file and row group readers and writers, record API,
metadata, etc.
//!
-//! See
[`serialized_reader::SerializedFileReader`](serialized_reader/struct.SerializedFileReader.html)
or
-//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html)
for a
-//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for
file
-//! metadata, and [`statistics`](statistics/index.html) for working with
statistics.
+//! # See Also:
+//! * [`SerializedFileReader`] and [`SerializedFileWriter`] for reading /
writing parquet
+//! * [`metadata`]: for working with metadata such as schema
+//! * [`statistics`]: for working with statistics in metadata
+//!
+//! [`SerializedFileReader`]: serialized_reader::SerializedFileReader
+//! [`SerializedFileWriter`]: writer::SerializedFileWriter
//!
//! # Example of writing a new file
//!
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index f3a09046a63..ab342d52b7f 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -57,10 +57,11 @@ impl<T> PageIndex<T> {
#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
-/// Typed statistics for a data page in a column chunk. This structure
-/// is obtained from decoding the [ColumnIndex] in the parquet file
-/// and can be used to skip decoding pages while reading the file
-/// data.
+/// Typed statistics for a data page in a column chunk.
+///
+/// This structure is part of the "Page Index" and is optionally part of
+/// [ColumnIndex] in the parquet file and can be used to skip decoding pages
+/// while reading the file data.
pub enum Index {
/// Sometimes reading page index from parquet file
/// will only return pageLocations without min_max index,
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index d24b91741be..7d704cc138f 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -20,6 +20,7 @@
//! Though some common methods are available on enum, use pattern match to
extract
//! actual min and max values from statistics, see below:
//!
+//! # Examples
//! ```rust
//! use parquet::file::statistics::Statistics;
//!
diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 9f4ddfe8285..b210d6ec1b7 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -1,4 +1,5 @@
-// Autogenerated by Thrift Compiler (0.19.0)
+//! See [`crate::file`] for easier to use APIs.
+// Autogenerated by Thrift Compiler (0.20.0)
// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#![allow(dead_code)]
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 1166703bb15..f8342453fec 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -28,25 +28,30 @@
//! # Format Overview
//!
//! Parquet is a columnar format, which means that unlike row formats like
[CSV], values are
-//! iterated along columns instead of rows. Parquet is similar in spirit to
[Arrow], with Parquet
-//! focusing on storage efficiency whereas Arrow prioritizes compute
efficiency.
+//! iterated along columns instead of rows. Parquet is similar in spirit to
[Arrow], but
+//! focuses on storage efficiency whereas Arrow prioritizes compute efficiency.
//!
//! Parquet files are partitioned for scalability. Each file contains metadata,
//! along with zero or more "row groups", each row group containing one or
//! more columns. The APIs in this crate reflect this structure.
//!
-//! Parquet distinguishes between "logical" and "physical" data types.
-//! For instance, strings (logical type) are stored as byte arrays (physical
type).
-//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
-//! are stored as integers (physical type). This crate exposes both kinds of
types.
+//! Data in Parquet files is strongly typed and differentiates between logical
+//! and physical types (see [`schema`]). In addition, Parquet files may contain
+//! other metadata, such as statistics, which can be used to optimize reading
+//! (see [`file::metadata`]).
+//! For more details about the Parquet format itself, see the [Parquet spec].
//!
-//! For more details about the Parquet format, see the
-//! [Parquet
spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//! [Parquet spec]:
https://github.com/apache/parquet-format/blob/master/README.md#file-format
//!
//! # APIs
//!
//! This crate exposes a number of APIs for different use-cases.
//!
+//! ## Metadata and Schema
+//!
+//! The [`schema`] module provides APIs to work with Parquet schemas. The
+//! [`file::metadata`] module provides APIs to work with Parquet metadata.
+//!
//! ## Read/Write Arrow
//!
//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow
`RecordBatch`.
@@ -64,7 +69,7 @@
//!
//! ## Read/Write Parquet
//!
-//! Workloads needing finer-grained control, or looking to not take a
dependency on arrow,
+//! Workloads needing finer-grained control, or looking to avoid a dependency on arrow,
//! can use the lower-level APIs in [`mod@file`]. These APIs expose the
underlying parquet
//! data model, and therefore require knowledge of the underlying parquet
format,
//! including the details of [Dremel] record shredding and [Logical Types].
Most workloads
diff --git a/parquet/src/schema/mod.rs b/parquet/src/schema/mod.rs
index ead7f1d2c0f..415802c990e 100644
--- a/parquet/src/schema/mod.rs
+++ b/parquet/src/schema/mod.rs
@@ -17,6 +17,20 @@
//! Parquet schema definitions and methods to print and parse schema.
//!
+//! * [`SchemaDescriptor`]: Describes the data types of the columns stored in a
file
+//! * [`ColumnDescriptor`]: Describes the schema of a single (leaf) column.
+//! * [`ColumnPath`]: Represents the location of a column in the schema (e.g.
a nested field)
+//!
+//! Parquet distinguishes
+//! between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical
type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type).
+//!
+//! [`SchemaDescriptor`]: types::SchemaDescriptor
+//! [`ColumnDescriptor`]: types::ColumnDescriptor
+//! [`ColumnPath`]: types::ColumnPath
+//!
//! # Example
//!
//! ```rust
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index dbf6e8dcb3b..a0cbf506f7c 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -37,8 +37,10 @@ pub type SchemaDescPtr = Arc<SchemaDescriptor>;
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
/// Representation of a Parquet type.
+///
/// Used to describe primitive leaf fields and structs, including top-level
schema.
-/// Note that the top-level schema type is represented using `GroupType` whose
+///
+/// Note that the top-level schema is represented using [`Type::GroupType`]
whose
/// repetition is `None`.
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
@@ -662,7 +664,7 @@ impl BasicTypeInfo {
// ----------------------------------------------------------------------
// Parquet descriptor definitions
-/// Represents a path in a nested schema
+/// Represents the location of a column in a Parquet schema
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
parts: Vec<String>,
@@ -737,21 +739,22 @@ impl AsRef<[String]> for ColumnPath {
}
}
-/// A descriptor for leaf-level primitive columns.
-/// This encapsulates information such as definition and repetition levels and
is used to
+/// Physical type for leaf-level primitive columns.
+///
+/// Also includes the maximum definition and repetition levels required to
/// re-assemble nested data.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
- // The "leaf" primitive type of this column
+ /// The "leaf" primitive type of this column
primitive_type: TypePtr,
- // The maximum definition level for this column
+ /// The maximum definition level for this column
max_def_level: i16,
- // The maximum repetition level for this column
+ /// The maximum repetition level for this column
max_rep_level: i16,
- // The path of this column. For instance, "a.b.c.d".
+ /// The path of this column. For instance, "a.b.c.d".
path: ColumnPath,
}
@@ -860,24 +863,33 @@ impl ColumnDescriptor {
}
}
-/// A schema descriptor. This encapsulates the top-level schemas for all the
columns,
-/// as well as all descriptors for all the primitive columns.
+/// Schema of a Parquet file.
+///
+/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
+/// each primitive (leaf) column.
#[derive(PartialEq)]
pub struct SchemaDescriptor {
- // The top-level schema (the "message" type).
- // This must be a `GroupType` where each field is a root column type in
the schema.
+ /// The top-level logical schema (the "message" type).
+ ///
+ /// This must be a [`Type::GroupType`] where each field is a root
+ /// column type in the schema.
schema: TypePtr,
- // All the descriptors for primitive columns in this schema, constructed
from
- // `schema` in DFS order.
+ /// The descriptors for the physical type of each leaf column in this
schema
+ ///
+ /// Constructed from `schema` in DFS order.
leaves: Vec<ColumnDescPtr>,
- // Mapping from a leaf column's index to the root column index that it
- // comes from. For instance: the leaf `a.b.c.d` would have a link back to
`a`:
- // -- a <-----+
- // -- -- b |
- // -- -- -- c |
- // -- -- -- -- d
+ /// Mapping from a leaf column's index to the root column index that it
+ /// comes from.
+ ///
+ /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
+ /// ```text
+ /// -- a <-----+
+ /// -- -- b |
+ /// -- -- -- c |
+ /// -- -- -- -- d
+ /// ```
leaf_to_base: Vec<usize>,
}