emkornfield commented on code in PR #544:
URL: https://github.com/apache/parquet-format/pull/544#discussion_r2677194471
##########
src/main/flatbuf/parquet3.fbs:
##########
@@ -0,0 +1,224 @@
+namespace parquet.format3;
+
+// Optimization notes
+// 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix
+// 2. ColumnMetaData.encoding_stats are removed, they are replaced with
+// ColumnMetaData.is_fully_dict_encoded.
+// 3. RowGroups are limited to 2GB in size, so we can use int for sizes.
+// 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can
+// use int for offsets.
+// 5. Remove ordinal.
+// 6. Restrict RowGroups to 2^31-1 rows.
+// 7. Remove offset/column indexes; they are small, and just their offsets are of similar size.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Physical types.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum Type : byte {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7,
+}
+
+enum FieldRepetitionType : byte {
+ REQUIRED = 0,
+ OPTIONAL = 1,
+ REPEATED = 2,
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Encodings.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Note: Match the thrift enum values so that we can cast between them.
+enum Encoding : byte {
+ PLAIN = 0,
+ // GROUP_VAR_INT = 1,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ // BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9,
+}
+
+// Note: Match the thrift enum values so that we can cast between them.
+enum CompressionCodec : byte {
+ UNCOMPRESSED = 0,
+ SNAPPY = 1,
+ GZIP = 2,
+ LZO = 3,
+ BROTLI = 4,
+ // LZ4 = 5,
+ ZSTD = 6,
+ LZ4_RAW = 7,
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Logical types.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+table Empty {}
+table DecimalOpts {
+ precision: int;
+ scale: int;
+}
+enum TimeUnit : byte {
+ MS = 0,
+ US = 1,
+ NS = 2,
+}
+table TimeOpts {
+ is_adjusted_to_utc: bool;
+ unit: TimeUnit;
+}
+table IntOpts {
+ bit_width: byte = 8;
+ is_signed: bool;
+}
+table GeometryType {
+ crs: string;
+}
+enum EdgeInterpolationAlgorithm : byte {
+ SPHERICAL = 0,
+ VINCENTY = 1,
+ THOMAS = 2,
+ ANDOYER = 3,
+ KARNEY = 4,
+}
+table GeographyType {
+ crs: string;
+ algorithm: EdgeInterpolationAlgorithm;
+}
+union LogicalType {
+ StringType:Empty,
+ MapType:Empty,
+ ListType:Empty,
+ EnumType:Empty,
+ DecimalType:DecimalOpts,
+ DateType:Empty,
+ TimeType:TimeOpts,
+ TimestampType:TimeOpts,
+ IntType:IntOpts,
+ NullType:Empty,
+ JsonType:Empty,
+ BsonType:Empty,
+ UUIDType:Empty,
+ Float16Type:Empty,
+ VariantType:Empty,
+ GeometryType:GeometryType,
+ GeographyType:GeographyType,
+}
+
+table Statistics {
+ null_count: int = null;
+ // Store min/max values as fixed-size entities depending on the physical type. If len is present
+ // then the min/max value is present.
+ //
+ // - BOOLEAN: none
+ // - INT32/FLOAT: lo4 (little-endian)
+ // - INT64/DOUBLE: lo8 (little-endian)
+ // - INT96: lo4+lo8 (little-endian)
+ // - FIXED_LEN_BYTE_ARRAY:
+ // - BYTE_ARRAY:
+ // prefix: the longest common prefix of min/max
+ // lo8+hi8 zero padded 16 bytes (big-endian) of the suffix
+ // len: the length of the suffix of the value after removing the prefix. If > 16 then the
+ // value is inexact
+ min_lo4: uint;
+ min_lo8: ulong;
+ min_hi8: ulong;
+ min_len: byte = null;
+ max_lo4: uint;
+ max_lo8: ulong;
+ max_hi8: ulong;
+ max_len: byte = null;
+ prefix: string;
+}
+
+union ColumnOrder {
+ TypeDefinedOrder:Empty,
+}
+
+table SchemaElement {
+ name: string;
+ type: Type = null;
+ repetition_type: FieldRepetitionType;
+ logical_type: LogicalType;
+ type_length: int = null;
+ num_children: int = 0;
+ field_id: int = null;
+ column_order: ColumnOrder; // only present for leaf nodes
+}
+
+enum PageType : byte {
+ DATA_PAGE = 0,
+ INDEX_PAGE = 1,
+ DICTIONARY_PAGE = 2,
+ DATA_PAGE_V2 = 3,
+}
+
+table KV {
+ key: string;
+ val: string;
+}
+
+table ColumnMetadata {
+ codec: CompressionCodec;
+ num_values: long = null; // only present if not equal to rg.num_rows
+ total_uncompressed_size: long;
+ total_compressed_size: long;
Review Comment:
It would be nice to keep the total unencoded size here, which I think is
generally useful. But I suppose it can be added later?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]