This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-format.git
The following commit(s) were added to refs/heads/master by this push:
new be0478a PARQUET-2480: Clarify what "page index" means in
Parquet.thrift (#245)
be0478a is described below
commit be0478a7f7d6b0cc12503983233c85f8ae0a5fa7
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed May 22 02:03:32 2024 -0400
PARQUET-2480: Clarify what "page index" means in Parquet.thrift (#245)
---
PageIndex.md | 10 ++++++----
src/main/thrift/parquet.thrift | 22 +++++++++++++++++-----
2 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/PageIndex.md b/PageIndex.md
index f4a8f64..a371c42 100644
--- a/PageIndex.md
+++ b/PageIndex.md
@@ -17,11 +17,13 @@
- under the License.
-->
-# ColumnIndex Layout to Support Page Skipping
+# Parquet page index: Layout to Support Page Skipping
-This document describes the format for column index pages in the Parquet
-footer. These pages contain statistics for DataPages and can be used to skip
-pages when scanning data in ordered and unordered columns.
+In Parquet, a *page index* is optional metadata for a
+ColumnChunk, containing statistics for DataPages that can be used
+to skip those pages when scanning data in ordered and unordered columns.
+The page index is stored using the OffsetIndex and ColumnIndex structures,
+defined in [`parquet.thrift`](src/main/thrift/parquet.thrift).
## Problem Statement
In previous versions of the format, Statistics are stored for ColumnChunks in
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 27d4043..c928ad6 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -738,10 +738,10 @@ struct PageHeader {
}
/**
- * Wrapper struct to specify sort order
+ * Sort order within a RowGroup of a leaf column
*/
struct SortingColumn {
- /** The column index (in this row group) **/
+ /** The ordinal position of the column (in this row group) **/
1: required i32 column_idx
/** If true, indicates this column is sorted in descending order. **/
@@ -1001,6 +1001,13 @@ struct PageLocation {
3: required i64 first_row_index
}
+/**
+ * Optional offsets for each data page in a ColumnChunk.
+ *
+ * Forms part of the page index, along with ColumnIndex.
+ *
+ * OffsetIndex may be present even if ColumnIndex is not.
+ */
struct OffsetIndex {
/**
* PageLocations, ordered by increasing PageLocation.offset. It is required
@@ -1017,8 +1024,14 @@ struct OffsetIndex {
}
/**
- * Description for ColumnIndex.
- * Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
+ * Optional statistics for each data page in a ColumnChunk.
+ *
+ * Forms part of the page index, along with OffsetIndex.
+ *
+ * If this structure is present, OffsetIndex must also be present.
+ *
+ * For each field in this structure, <field>[i] refers to the page at
+ * OffsetIndex.page_locations[i]
*/
struct ColumnIndex {
/**
@@ -1071,7 +1084,6 @@ struct ColumnIndex {
* Same as repetition_level_histograms except for definition levels.
**/
7: optional list<i64> definition_level_histograms;
-
}
struct AesGcmV1 {