This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-format.git
The following commit(s) were added to refs/heads/master by this push:
new 38c108c PARQUET-2473: Clarify records can not be split across v2
pages or PageIndex (#244)
38c108c is described below
commit 38c108c8ff24a432db40453b2f04493534c1d2cf
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri May 31 08:55:23 2024 -0400
PARQUET-2473: Clarify records can not be split across v2 pages or PageIndex
(#244)
Co-authored-by: Ed Seidl <[email protected]>
---
src/main/thrift/parquet.thrift | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index c928ad6..85e8887 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -578,7 +578,13 @@ enum BoundaryOrder {
/** Data page header */
struct DataPageHeader {
- /** Number of values, including NULLs, in this data page. **/
+ /**
+ * Number of values, including NULLs, in this data page.
+ *
+ * If an OffsetIndex is present, a page must begin at a record
+ * boundary (repetition_level = 0). Otherwise, pages may begin
+ * within a record (repetition_level > 0).
+ **/
1: required i32 num_values
/** Encoding used for this data page **/
@@ -625,7 +631,11 @@ struct DataPageHeaderV2 {
/** Number of NULL values, in this data page.
Number of non-null = num_values - num_nulls which is also the number of
values in the data section **/
2: required i32 num_nulls
- /** Number of rows in this data page. which means pages change on record
boundaries (r = 0) **/
+ /**
+ * Number of rows in this data page. Every page must begin at a
+ * record boundary (repetition_level = 0): records must **not** be
+ * split across page boundaries when using V2 data pages.
+ **/
3: required i32 num_rows
/** Encoding used for data in this page **/
4: required Encoding encoding
@@ -995,8 +1005,9 @@ struct PageLocation {
2: required i32 compressed_page_size
/**
- * Index within the RowGroup of the first row of the page; this means pages
- * change on record boundaries (r = 0).
+ * Index within the RowGroup of the first row of the page. When an
+ * OffsetIndex is present, pages must begin on record boundaries
+ * (repetition_level = 0).
*/
3: required i64 first_row_index
}
@@ -1190,4 +1201,3 @@ struct FileCryptoMetaData {
* and (possibly) columns **/
2: optional binary key_metadata
}
-