This is an automated email from the ASF dual-hosted git repository.
gabor pushed a commit to branch production
in repository https://gitbox.apache.org/repos/asf/parquet-site.git
The following commit(s) were added to refs/heads/production by this push:
new 5ab1cc6 Update Metadata Diagrams (#106)
5ab1cc6 is described below
commit 5ab1cc62cee7214f3c1f17e6b8a3a22e721eb6de
Author: Kenny Daniel <[email protected]>
AuthorDate: Wed Mar 5 23:19:07 2025 -0800
Update Metadata Diagrams (#106)
---
README.md | 16 +++
content/en/docs/File Format/metadata.md | 21 ++--
package.json | 1 +
static/images/FileFormat.gif | Bin 47208 -> 0 bytes
static/images/FileLayoutBloomFilter2.png | Bin
static/images/FileLayoutEncryptionEF.png | Bin
static/images/FileLayoutEncryptionPF.png | Bin
static/images/FileMetaData.mermaid | 173 +++++++++++++++++++++++++++++++
static/images/FileMetaData.svg | 1 +
static/images/PageHeader.mermaid | 62 +++++++++++
static/images/PageHeader.svg | 1 +
11 files changed, 268 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 588e202..9656e06 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,22 @@ To preview this website site locally, run the following in the root of the direc
hugo server
```
+### Building metadata diagrams
+
+To build the metadata svg diagrams, you need mermaid.js installed. You can install it using npm:
+
+```
+npm install -D @mermaid-js/mermaid-cli
+```
+
+Then you can build the diagrams using the following command:
+
+```
+cd static/images
+npx mmdc -i FileMetaData.mermaid -o FileMetaData.svg
+npx mmdc -i PageHeader.mermaid -o PageHeader.svg
+```
+
## Building and Running in Docker
If you don't want to install `hugo` and its dependencies on your local machine,
diff --git a/content/en/docs/File Format/metadata.md b/content/en/docs/File Format/metadata.md
index f86b160..0e3640d 100644
--- a/content/en/docs/File Format/metadata.md
+++ b/content/en/docs/File Format/metadata.md
@@ -4,16 +4,23 @@ linkTitle: "Metadata"
weight: 5
---
There are two types of metadata: file metadata, and page header metadata.
-In the diagram below, file metadata is described by the `FileMetaData`
-structure. This file metadata provides offset and size information useful
-when navigating the Parquet file. Page header metadata (`PageHeader` and
-children in the diagram) is stored in-line with the page data, and is
-used in the reading and decoding of said data.
-
All thrift structures are serialized using the TCompactProtocol. The full
definition of these structures is given in the Parquet
[Thrift definition](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift).
+## File metadata
+
+In the diagram below, file metadata is described by the `FileMetaData`
+structure. This file metadata provides offset and size information useful
+when navigating the Parquet file.
+
+
+
+## Page header
+
+Page header metadata (`PageHeader` and children in the diagram) is stored
+in-line with the page data, and is used in the reading and decoding of data.
+
+
-
diff --git a/package.json b/package.json
index 9d6acd2..41b70d3 100644
--- a/package.json
+++ b/package.json
@@ -17,6 +17,7 @@
},
"homepage": "https://github.com/apache/parquet-site#readme",
"devDependencies": {
+ "@mermaid-js/mermaid-cli": "^11.4.2",
"autoprefixer": "^10.4.18",
"postcss": "^8.4.35",
"postcss-cli": "^11.0.0"
diff --git a/static/images/FileFormat.gif b/static/images/FileFormat.gif
deleted file mode 100644
index f4cf5e1..0000000
Binary files a/static/images/FileFormat.gif and /dev/null differ
diff --git a/static/images/FileLayoutBloomFilter2.png b/static/images/FileLayoutBloomFilter2.png
old mode 100755
new mode 100644
diff --git a/static/images/FileLayoutEncryptionEF.png b/static/images/FileLayoutEncryptionEF.png
old mode 100755
new mode 100644
diff --git a/static/images/FileLayoutEncryptionPF.png b/static/images/FileLayoutEncryptionPF.png
old mode 100755
new mode 100644
diff --git a/static/images/FileMetaData.mermaid b/static/images/FileMetaData.mermaid
new file mode 100644
index 0000000..be96680
--- /dev/null
+++ b/static/images/FileMetaData.mermaid
@@ -0,0 +1,173 @@
+classDiagram
+ FileMetaData --> SchemaElement
+ FileMetaData --> RowGroup
+
+ RowGroup --> ColumnChunk
+
+ ColumnChunk --> ColumnMetaData
+
+ ColumnMetaData --> Statistics
+ ColumnMetaData --> Type
+ ColumnMetaData --> Encoding
+ ColumnMetaData --> CompressionCodec
+
+ SchemaElement --> LogicalTypes
+ SchemaElement --> Type
+ SchemaElement --> ConvertedType
+
+ class FileMetaData {
+ int32 version
+ list~SchemaElement~ schema
+ int64 num_rows
+ list~RowGroup~ row_groups
+ list~KeyValue~ key_value_metadata
+ string created_by
+ list~ColumnOrder~ column_orders
+ EncryptionAlgorithm encryption_algorithm
+ binary footer_signing_key_metadata
+ }
+
+ class SchemaElement {
+ Type type
+ int32 type_length
+ FieldRepetitionType repetition_type
+ string name
+ int32 num_children
+ ConvertedType converted_type
+ int32 scale
+ int32 precision
+ int32 field_id
+ LogicalType logicalType
+ }
+
+ class Type {
+ BOOLEAN
+ INT32
+ INT64
+ INT96
+ FLOAT
+ DOUBLE
+ BYTE_ARRAY
+ FIXED_LEN_BYTE_ARRAY
+ }
+
+ class LogicalTypes {
+ StringType
+ MapType
+ ListType
+ EnumType
+ DecimalType
+ DateType
+ TimeType
+ TimestampType
+ IntType
+ NullType
+ JsonType
+ BsonType
+ UUIDType
+ Float16Type
+ VariantType
+ GeometryType
+ GeographyType
+ }
+
+ class ConvertedType {
+ UTF8
+ MAP
+ MAP_KEY_VALUE
+ LIST
+ ENUM
+ DECIMAL
+ DATE
+ TIME_MILLIS
+ TIME_MICROS
+ TIMESTAMP_MILLIS
+ TIMESTAMP_MICROS
+ UINT_8
+ UINT_16
+ UINT_32
+ UINT_64
+ INT_8
+ INT_16
+ INT_32
+ INT_64
+ JSON
+ BSON
+ INTERVAL
+ }
+
+ class Encoding {
+ PLAIN
+ PLAIN_DICTIONARY
+ RLE
+ BIT_PACKED
+ DELTA_BINARY_PACKED
+ DELTA_LENGTH_BYTE_ARRAY
+ DELTA_BYTE_ARRAY
+ RLE_DICTIONARY
+ BYTE_STREAM_SPLIT
+ }
+
+ class CompressionCodec {
+ UNCOMPRESSED
+ SNAPPY
+ GZIP
+ LZO
+ BROTLI
+ LZ4
+ ZSTD
+ LZ4_RAW
+ }
+
+ class RowGroup {
+ list~ColumnChunk~ columns
+ int64 total_byte_size
+ int64 num_rows
+ list~SortingColumn~ sorting_columns
+ int64 file_offset
+ int64 total_compressed_size
+ int16 ordinal
+ }
+
+ class ColumnChunk {
+ string file_path
+ int64 file_offset
+ ColumnMetaData meta_data
+ int64 offset_index_offset
+ int32 offset_index_length
+ int64 column_index_offset
+ int32 column_index_length
+ ColumnCryptoMetaData crypto_metadata
+ binary encrypted_column_metadata
+ }
+
+ class ColumnMetaData {
+ Type type
+ list~Encoding~ encodings
+ list~string~ path_in_schema
+ CompressionCodec codec
+ int64 num_values
+ int64 total_uncompressed_size
+ int64 total_compressed_size
+ list~KeyValue~ key_value_metadata
+ int64 data_page_offset
+ int64 index_page_offset
+ int64 dictionary_page_offset
+ Statistics statistics
+ list~PageEncodingStats~ encoding_stats
+ int64 bloom_filter_offset
+ int32 bloom_filter_length
+ SizeStatistics size_statistics
+ GeospatialStatistics geospatial_statistics
+ }
+
+ class Statistics {
+ binary max
+ binary min
+ int64 null_count
+ int64 distinct_count
+ binary max_value
+ binary min_value
+ bool is_max_value_exact
+ bool is_min_value_exact
+ }
diff --git a/static/images/FileMetaData.svg b/static/images/FileMetaData.svg
new file mode 100644
index 0000000..1b5ac94
--- /dev/null
+++ b/static/images/FileMetaData.svg
@@ -0,0 +1 @@
+<svg aria-roledescription="class" role="graphics-document document" viewBox="0
0 1592.3203125 2232" style="max-width: 1592.32px; background-color: white;"
class="classDiagram" xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg" width="100%"
id="my-svg"><style>#my-svg{font-family:"trebuchet
ms",verdana,arial,sans-serif;font-size:16px;fill:#333;}#my-svg
.error-icon{fill:#552222;}#my-svg
.error-text{fill:#552222;stroke:#552222;}#my-svg .edge-thickness-normal{stroke
[...]
\ No newline at end of file
diff --git a/static/images/PageHeader.mermaid b/static/images/PageHeader.mermaid
new file mode 100644
index 0000000..854eebd
--- /dev/null
+++ b/static/images/PageHeader.mermaid
@@ -0,0 +1,62 @@
+classDiagram
+ PageHeader --> PageType
+ PageHeader --> DictionaryPageHeader
+ PageHeader --> DataPageHeader
+ PageHeader --> DataPageHeaderV2
+
+ DataPageHeader --> Statistics
+ DataPageHeaderV2 --> Statistics
+
+ class PageHeader {
+ PageType type
+ int32 uncompressed_page_size
+ int32 compressed_page_size
+ int32 crc
+ DataPageHeader data_page_header
+ IndexPageHeader index_page_header
+ DictionaryPageHeader dictionary_page_header
+ DataPageHeaderV2 data_page_header_v2
+ }
+
+ class PageType {
+ DATA_PAGE = 0
+ INDEX_PAGE = 1
+ DICTIONARY_PAGE = 2
+ DATA_PAGE_V2 = 3
+ }
+
+ class DataPageHeader {
+ int32 num_values
+ Encoding encoding
+ Encoding definition_level_encoding
+ Encoding repetition_level_encoding
+ Statistics statistics
+ }
+
+ class DictionaryPageHeader {
+ int32 num_values
+ Encoding encoding
+ bool is_sorted
+ }
+
+ class DataPageHeaderV2 {
+ int32 num_values
+ int32 num_nulls
+ int32 num_rows
+ Encoding encoding
+ int32 definition_levels_byte_length
+ int32 repetition_levels_byte_length
+ bool is_compressed
+ Statistics statistics
+ }
+
+ class Statistics {
+ binary max
+ binary min
+ int64 null_count
+ int64 distinct_count
+ binary max_value
+ binary min_value
+ bool is_max_value_exact
+ bool is_min_value_exact
+ }
diff --git a/static/images/PageHeader.svg b/static/images/PageHeader.svg
new file mode 100644
index 0000000..e0dad4c
--- /dev/null
+++ b/static/images/PageHeader.svg
@@ -0,0 +1 @@
+<svg aria-roledescription="class" role="graphics-document document" viewBox="0
0 1313.5 980" style="max-width: 1313.5px; background-color: white;"
class="classDiagram" xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg" width="100%"
id="my-svg"><style>#my-svg{font-family:"trebuchet
ms",verdana,arial,sans-serif;font-size:16px;fill:#333;}#my-svg
.error-icon{fill:#552222;}#my-svg
.error-text{fill:#552222;stroke:#552222;}#my-svg
.edge-thickness-normal{stroke-width:1 [...]
\ No newline at end of file