Repository: parquet-cpp Updated Branches: refs/heads/master 921fd3028 -> b1c85caf9
PARQUET-848: Build Thrift bits as part of main parquet_objlib component Author: Wes McKinney <[email protected]> Closes #232 from wesm/remove-libparquet_thrift and squashes the following commits: d72546b [Wes McKinney] Remove parquet/thrift subdirectory and separate parquet_thrift library component Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/b1c85caf Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/b1c85caf Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/b1c85caf Branch: refs/heads/master Commit: b1c85caf92678e46abd70037e639a922da47c617 Parents: 921fd30 Author: Wes McKinney <[email protected]> Authored: Sun Jan 29 12:44:38 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Sun Jan 29 12:44:38 2017 -0500 ---------------------------------------------------------------------- CMakeLists.txt | 44 +- src/parquet/.gitignore | 2 + src/parquet/file/file-deserialize-test.cc | 4 +- src/parquet/file/metadata.cc | 2 +- src/parquet/file/reader-internal.cc | 2 +- src/parquet/file/reader-internal.h | 2 +- src/parquet/file/writer-internal.cc | 2 +- src/parquet/file/writer-internal.h | 2 +- src/parquet/parquet.thrift | 574 +++++++++++++++++++++++++ src/parquet/schema-internal.h | 2 +- src/parquet/schema-test.cc | 2 +- src/parquet/schema.cc | 4 +- src/parquet/thrift.h | 147 +++++++ src/parquet/thrift/.gitignore | 2 - src/parquet/thrift/CMakeLists.txt | 49 --- src/parquet/thrift/parquet.thrift | 574 ------------------------- src/parquet/thrift/util.h | 147 ------- 17 files changed, 772 insertions(+), 789 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ff1421..ffda1ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-473,6 +473,35 @@ else() endif() ############################################################ +# Generated Thrift sources + +set(THRIFT_SRCS + src/parquet/parquet_constants.cpp + src/parquet/parquet_types.cpp) + +set_source_files_properties(src/parquet/parquet_types.cpp PROPERTIES + COMPILE_FLAGS -Wno-unused-variable) + +# List of thrift output targets +set(THRIFT_OUTPUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/parquet) +set(THRIFT_OUTPUT_FILES "${THRIFT_OUTPUT_DIR}/parquet_types.cpp") +set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${THRIFT_OUTPUT_DIR}/parquet_types.h") +set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${THRIFT_OUTPUT_DIR}/parquet_constants.cpp") +set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${THRIFT_OUTPUT_DIR}/parquet_constants.h") + +set_source_files_properties(${THRIFT_OUTPUT_FILES} PROPERTIES GENERATED TRUE) + +get_filename_component(ABS_PARQUET_THRIFT src/parquet/parquet.thrift ABSOLUTE) + +add_custom_command( + OUTPUT ${THRIFT_OUTPUT_FILES} + COMMAND ${THRIFT_COMPILER} --gen cpp -out ${THRIFT_OUTPUT_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src/parquet/parquet.thrift + DEPENDS ${ABS_PARQUET_THRIFT} thriftstatic + COMMENT "Running thrift compiler on parquet.thrift" + VERBATIM +) + +############################################################ # Library config set(LIBPARQUET_SRCS @@ -496,6 +525,9 @@ set(LIBPARQUET_SRCS src/parquet/schema.cc + src/parquet/parquet_constants.cpp + src/parquet/parquet_types.cpp + src/parquet/util/cpu-info.cc src/parquet/util/memory.cc ) @@ -504,7 +536,6 @@ set(LIBPARQUET_LINK_LIBS ) set(BUNDLED_STATIC_LIBS - parquet_thrift brotlistatic_dec brotlistatic_enc brotlistatic_common @@ -523,6 +554,12 @@ add_library(parquet_objlib OBJECT ${LIBPARQUET_SRCS} ) +# # Ensure that thrift compilation is done before using its generated headers +# # in parquet code. 
+add_custom_target(thrift-deps ALL + DEPENDS ${THRIFT_OUTPUT_FILES}) +add_dependencies(parquet_objlib thrift-deps) + # Although we don't link parquet_objlib against anything, we need it to depend # on these libs as we may generate their headers via ExternalProject_Add add_dependencies(parquet_objlib @@ -574,13 +611,8 @@ add_subdirectory(src/parquet/api) add_subdirectory(src/parquet/column) add_subdirectory(src/parquet/encodings) add_subdirectory(src/parquet/file) -add_subdirectory(src/parquet/thrift) add_subdirectory(src/parquet/util) -# Ensure that thrift compilation is done before using its generated headers -# in parquet code. -add_dependencies(parquet_objlib parquet_thrift) - add_subdirectory(benchmarks) add_subdirectory(examples) add_subdirectory(tools) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/.gitignore ---------------------------------------------------------------------- diff --git a/src/parquet/.gitignore b/src/parquet/.gitignore new file mode 100644 index 0000000..0695270 --- /dev/null +++ b/src/parquet/.gitignore @@ -0,0 +1,2 @@ +parquet_constants.* +parquet_types.* \ No newline at end of file http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/file/file-deserialize-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-deserialize-test.cc b/src/parquet/file/file-deserialize-test.cc index 30b3f6d..8e5e868 100644 --- a/src/parquet/file/file-deserialize-test.cc +++ b/src/parquet/file/file-deserialize-test.cc @@ -30,8 +30,8 @@ #include "parquet/compression.h" #include "parquet/exception.h" #include "parquet/file/reader-internal.h" -#include "parquet/thrift/parquet_types.h" -#include "parquet/thrift/util.h" +#include "parquet/parquet_types.h" +#include "parquet/thrift.h" #include "parquet/types.h" #include "parquet/util/memory.h" #include "parquet/util/test-common.h" 
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/file/metadata.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index b4dffde..0fa4e44 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -23,7 +23,7 @@ #include "parquet/file/metadata.h" #include "parquet/schema-internal.h" #include "parquet/schema.h" -#include "parquet/thrift/util.h" +#include "parquet/thrift.h" #include "parquet/util/memory.h" #include <boost/algorithm/string.hpp> http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/file/reader-internal.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc index 425a001..328197a 100644 --- a/src/parquet/file/reader-internal.cc +++ b/src/parquet/file/reader-internal.cc @@ -28,7 +28,7 @@ #include "parquet/compression.h" #include "parquet/exception.h" #include "parquet/schema.h" -#include "parquet/thrift/util.h" +#include "parquet/thrift.h" #include "parquet/types.h" #include "parquet/util/memory.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/file/reader-internal.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.h b/src/parquet/file/reader-internal.h index 4ff065d..9f436ca 100644 --- a/src/parquet/file/reader-internal.h +++ b/src/parquet/file/reader-internal.h @@ -27,7 +27,7 @@ #include "parquet/compression.h" #include "parquet/file/metadata.h" #include "parquet/file/reader.h" -#include "parquet/thrift/parquet_types.h" +#include "parquet/parquet_types.h" #include "parquet/types.h" #include "parquet/util/memory.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/file/writer-internal.cc ---------------------------------------------------------------------- diff 
--git a/src/parquet/file/writer-internal.cc b/src/parquet/file/writer-internal.cc index 56f08f3..877f668 100644 --- a/src/parquet/file/writer-internal.cc +++ b/src/parquet/file/writer-internal.cc @@ -20,7 +20,7 @@ #include "parquet/column/writer.h" #include "parquet/schema-internal.h" #include "parquet/schema.h" -#include "parquet/thrift/util.h" +#include "parquet/thrift.h" #include "parquet/util/memory.h" using parquet::schema::GroupNode; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/file/writer-internal.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/writer-internal.h b/src/parquet/file/writer-internal.h index e711c44..f803f92 100644 --- a/src/parquet/file/writer-internal.h +++ b/src/parquet/file/writer-internal.h @@ -25,7 +25,7 @@ #include "parquet/compression.h" #include "parquet/file/metadata.h" #include "parquet/file/writer.h" -#include "parquet/thrift/parquet_types.h" +#include "parquet/parquet_types.h" #include "parquet/util/memory.h" namespace parquet { http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/parquet.thrift ---------------------------------------------------------------------- diff --git a/src/parquet/parquet.thrift b/src/parquet/parquet.thrift new file mode 100644 index 0000000..b61c084 --- /dev/null +++ b/src/parquet/parquet.thrift @@ -0,0 +1,574 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * File format description for the parquet file format + */ +namespace cpp parquet.format +namespace java parquet.format + +/** + * Types supported by Parquet. These types are intended to be used in combination + * with the encodings to control the on disk storage format. + * For example INT16 is not included as a type since a good encoding of INT32 + * would handle this. + */ +enum Type { + BOOLEAN = 0; + INT32 = 1; + INT64 = 2; + INT96 = 3; + FLOAT = 4; + DOUBLE = 5; + BYTE_ARRAY = 6; + FIXED_LEN_BYTE_ARRAY = 7; +} + +/** + * Common types used by frameworks(e.g. hive, pig) using parquet. This helps map + * between types in those frameworks to the base types in parquet. This is only + * metadata and not needed to read or write the data. + */ +enum ConvertedType { + /** a BYTE_ARRAY actually contains UTF8 encoded chars */ + UTF8 = 0; + + /** a map is converted as an optional field containing a repeated key/value pair */ + MAP = 1; + + /** a key/value pair is converted into a group of two fields */ + MAP_KEY_VALUE = 2; + + /** a list is converted into an optional field containing a repeated field for its + * values */ + LIST = 3; + + /** an enum is converted into a binary field */ + ENUM = 4; + + /** + * A decimal value. + * + * This may be used to annotate binary or fixed primitive types. The + * underlying byte array stores the unscaled value encoded as two's + * complement using big-endian byte order (the most significant byte is the + * zeroth element). The value of the decimal is the value * 10^{-scale}. 
+ * + * This must be accompanied by a (maximum) precision and a scale in the + * SchemaElement. The precision specifies the number of digits in the decimal + * and the scale stores the location of the decimal point. For example 1.23 + * would have precision 3 (3 total digits) and scale 2 (the decimal point is + * 2 digits over). + */ + DECIMAL = 5; + + /** + * A Date + * + * Stored as days since Unix epoch, encoded as the INT32 physical type. + * + */ + DATE = 6; + + /** + * A time + * + * The total number of milliseconds since midnight. The value is stored + * as an INT32 physical type. + */ + TIME_MILLIS = 7; + + /** + * A time. + * + * The total number of microseconds since midnight. The value is stored as + * an INT64 physical type. + */ + TIME_MICROS = 8; + + /** + * A date/time combination + * + * Date and time recorded as milliseconds since the Unix epoch. Recorded as + * a physical type of INT64. + */ + TIMESTAMP_MILLIS = 9; + + /** + * A date/time combination + * + * Date and time recorded as microseconds since the Unix epoch. The value is + * stored as an INT64 physical type. + */ + TIMESTAMP_MICROS = 10; + + + /** + * An unsigned integer value. + * + * The number describes the maximum number of meaningful data bits in + * the stored value. 8, 16 and 32 bit values are stored using the + * INT32 physical type. 64 bit values are stored using the INT64 + * physical type. + * + */ + UINT_8 = 11; + UINT_16 = 12; + UINT_32 = 13; + UINT_64 = 14; + + /** + * A signed integer value. + * + * The number describes the maximum number of meaningful data bits in + * the stored value. 8, 16 and 32 bit values are stored using the + * INT32 physical type. 64 bit values are stored using the INT64 + * physical type. + * + */ + INT_8 = 15; + INT_16 = 16; + INT_32 = 17; + INT_64 = 18; + + /** + * An embedded JSON document + * + * A JSON document embedded within a single UTF8 column. 
+ */ + JSON = 19; + + /** + * An embedded BSON document + * + * A BSON document embedded within a single BINARY column. + */ + BSON = 20; + + /** + * An interval of time + * + * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 + * This data is composed of three separate little endian unsigned + * integers. Each stores a component of a duration of time. The first + * integer identifies the number of months associated with the duration, + * the second identifies the number of days associated with the duration + * and the third identifies the number of milliseconds associated with + * the provided duration. This duration of time is independent of any + * particular timezone or date. + */ + INTERVAL = 21; + +} + +/** + * Representation of Schemas + */ +enum FieldRepetitionType { + /** This field is required (can not be null) and each record has exactly 1 value. */ + REQUIRED = 0; + + /** The field is optional (can be null) and each record has 0 or 1 values. */ + OPTIONAL = 1; + + /** The field is repeated and can contain 0 or more values */ + REPEATED = 2; +} + +/** + * Statistics per row group and per page + * All fields are optional. + */ +struct Statistics { + /** min and max value of the column, encoded in PLAIN encoding */ + 1: optional binary max; + 2: optional binary min; + /** count of null value in the column */ + 3: optional i64 null_count; + /** count of distinct values occurring */ + 4: optional i64 distinct_count; +} + +/** + * Represents an element inside a schema definition. + * - if it is a group (inner node) then type is undefined and num_children is defined + * - if it is a primitive type (leaf) then type is defined and num_children is undefined + * the nodes are listed in depth first traversal order. + */ +struct SchemaElement { + /** Data type for this field. Not set if the current element is a non-leaf node */ + 1: optional Type type; + + /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. 
+ * Otherwise, if specified, this is the maximum bit length to store any of the values. + * (e.g. a low cardinality INT col could have this set to 3). Note that this is + * in the schema, and therefore fixed for the entire file. + */ + 2: optional i32 type_length; + + /** repetition of the field. The root of the schema does not have a repetition_type. + * All other nodes must have one */ + 3: optional FieldRepetitionType repetition_type; + + /** Name of the field in the schema */ + 4: required string name; + + /** Nested fields. Since thrift does not support nested fields, + * the nesting is flattened to a single list by a depth-first traversal. + * The children count is used to construct the nested relationship. + * This field is not set when the element is a primitive type + */ + 5: optional i32 num_children; + + /** When the schema is the result of a conversion from another model + * Used to record the original type to help with cross conversion. + */ + 6: optional ConvertedType converted_type; + + /** Used when this column contains decimal data. + * See the DECIMAL converted type for more details. + */ + 7: optional i32 scale + 8: optional i32 precision + + /** When the original schema supports field ids, this will save the + * original field id in the parquet schema + */ + 9: optional i32 field_id; + +} + +/** + * Encodings supported by Parquet. Not all encodings are valid for all types. These + * enums are also used to specify the encoding of definition and repetition levels. + * See the accompanying doc for the details of the more complicated encodings. + */ +enum Encoding { + /** Default encoding. + * BOOLEAN - 1 bit per value. 0 is false; 1 is true. + * INT32 - 4 bytes per value. Stored as little-endian. + * INT64 - 8 bytes per value. Stored as little-endian. + * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. 
+ * FIXED_LEN_BYTE_ARRAY - Just the bytes. + */ + PLAIN = 0; + + /** Group VarInt encoding for INT32/INT64. + * This encoding is deprecated. It was never used + */ + // GROUP_VAR_INT = 1; + + /** + * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + * plain type. + * in a data page use RLE_DICTIONARY instead. + * in a Dictionary page use PLAIN instead + */ + PLAIN_DICTIONARY = 2; + + /** Group packed run length encoding. Usable for definition/repetition levels + * encoding and Booleans (on one bit: 0 is false; 1 is true.) + */ + RLE = 3; + + /** Bit packed encoding. This can only be used if the data has a known max + * width. Usable for definition/repetition levels encoding. + */ + BIT_PACKED = 4; + + /** Delta encoding for integers. This can be used for int columns and works best + * on sorted data + */ + DELTA_BINARY_PACKED = 5; + + /** Encoding for byte arrays to separate the length values and the data. The lengths + * are encoded using DELTA_BINARY_PACKED + */ + DELTA_LENGTH_BYTE_ARRAY = 6; + + /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. + * Suffixes are stored as delta length byte arrays. + */ + DELTA_BYTE_ARRAY = 7; + + /** Dictionary encoding: the ids are encoded using the RLE encoding + */ + RLE_DICTIONARY = 8; +} + +/** + * Supported compression algorithms. + */ +enum CompressionCodec { + UNCOMPRESSED = 0; + SNAPPY = 1; + GZIP = 2; + LZO = 3; + BROTLI = 4; +} + +enum PageType { + DATA_PAGE = 0; + INDEX_PAGE = 1; + DICTIONARY_PAGE = 2; + DATA_PAGE_V2 = 3; +} + +/** Data page header */ +struct DataPageHeader { + /** Number of values, including NULLs, in this data page. 
**/ + 1: required i32 num_values + + /** Encoding used for this data page **/ + 2: required Encoding encoding + + /** Encoding used for definition levels **/ + 3: required Encoding definition_level_encoding; + + /** Encoding used for repetition levels **/ + 4: required Encoding repetition_level_encoding; + + /** Optional statistics for the data in this page**/ + 5: optional Statistics statistics; +} + +struct IndexPageHeader { + /** TODO: **/ +} + +struct DictionaryPageHeader { + /** Number of values in the dictionary **/ + 1: required i32 num_values; + + /** Encoding using this dictionary page **/ + 2: required Encoding encoding + + /** If true, the entries in the dictionary are sorted in ascending order **/ + 3: optional bool is_sorted; +} + +/** + * New page format allowing reading levels without decompressing the data + * Repetition and definition levels are uncompressed + * The remaining section containing the data is compressed if is_compressed is true + **/ +struct DataPageHeaderV2 { + /** Number of values, including NULLs, in this data page. **/ + 1: required i32 num_values + /** Number of NULL values, in this data page. + Number of non-null = num_values - num_nulls which is also the number of values in the data section **/ + 2: required i32 num_nulls + /** Number of rows in this data page. which means pages change on record boundaries (r = 0) **/ + 3: required i32 num_rows + /** Encoding used for data in this page **/ + 4: required Encoding encoding + + // repetition levels and definition levels are always using RLE (without size in it) + + /** length of the definition levels */ + 5: required i32 definition_levels_byte_length; + /** length of the repetition levels */ + 6: required i32 repetition_levels_byte_length; + + /** whether the values are compressed. + Which means the section of the page between + definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) + is compressed with the compression_codec. 
+ If missing it is considered compressed */ + 7: optional bool is_compressed = 1; + + /** optional statistics for this column chunk */ + 8: optional Statistics statistics; +} + +struct PageHeader { + /** the type of the page: indicates which of the *_header fields is set **/ + 1: required PageType type + + /** Uncompressed page size in bytes (not including this header) **/ + 2: required i32 uncompressed_page_size + + /** Compressed page size in bytes (not including this header) **/ + 3: required i32 compressed_page_size + + /** 32bit crc for the data below. This allows for disabling checksumming in HDFS + * if only a few pages needs to be read + **/ + 4: optional i32 crc + + // Headers for page specific data. One only will be set. + 5: optional DataPageHeader data_page_header; + 6: optional IndexPageHeader index_page_header; + 7: optional DictionaryPageHeader dictionary_page_header; + 8: optional DataPageHeaderV2 data_page_header_v2; +} + +/** + * Wrapper struct to store key values + */ + struct KeyValue { + 1: required string key + 2: optional string value +} + +/** + * Wrapper struct to specify sort order + */ +struct SortingColumn { + /** The column index (in this row group) **/ + 1: required i32 column_idx + + /** If true, indicates this column is sorted in descending order. **/ + 2: required bool descending + + /** If true, nulls will come before non-null values, otherwise, + * nulls go at the end. */ + 3: required bool nulls_first +} + +/** + * statistics of a given page type and encoding + */ +struct PageEncodingStats { + + /** the page type (data/dic/...) **/ + 1: required PageType page_type; + + /** encoding of the page **/ + 2: required Encoding encoding; + + /** number of pages of this type with this encoding **/ + 3: required i32 count; + +} + +/** + * Description for column metadata + */ +struct ColumnMetaData { + /** Type of this column **/ + 1: required Type type + + /** Set of all encodings used for this column. 
The purpose is to validate + * whether we can decode those pages. **/ + 2: required list<Encoding> encodings + + /** Path in schema **/ + 3: required list<string> path_in_schema + + /** Compression codec **/ + 4: required CompressionCodec codec + + /** Number of values in this column **/ + 5: required i64 num_values + + /** total byte size of all uncompressed pages in this column chunk (including the headers) **/ + 6: required i64 total_uncompressed_size + + /** total byte size of all compressed pages in this column chunk (including the headers) **/ + 7: required i64 total_compressed_size + + /** Optional key/value metadata **/ + 8: optional list<KeyValue> key_value_metadata + + /** Byte offset from beginning of file to first data page **/ + 9: required i64 data_page_offset + + /** Byte offset from beginning of file to root index page **/ + 10: optional i64 index_page_offset + + /** Byte offset from the beginning of file to first (only) dictionary page **/ + 11: optional i64 dictionary_page_offset + + /** optional statistics for this column chunk */ + 12: optional Statistics statistics; + + /** Set of all encodings used for pages in this column chunk. + * This information can be used to determine if all data pages are + * dictionary encoded for example **/ + 13: optional list<PageEncodingStats> encoding_stats; +} + +struct ColumnChunk { + /** File where column data is stored. If not set, assumed to be same file as + * metadata. This path is relative to the current file. + **/ + 1: optional string file_path + + /** Byte offset in file_path to the ColumnMetaData **/ + 2: required i64 file_offset + + /** Column metadata for this chunk. This is the same content as what is at + * file_path/file_offset. Having it here has it replicated in the file + * metadata. + **/ + 3: optional ColumnMetaData meta_data +} + +struct RowGroup { + /** Metadata for each column chunk in this row group. + * This list must have the same order as the SchemaElement list in FileMetaData. 
+ **/ + 1: required list<ColumnChunk> columns + + /** Total byte size of all the uncompressed column data in this row group **/ + 2: required i64 total_byte_size + + /** Number of rows in this row group **/ + 3: required i64 num_rows + + /** If set, specifies a sort ordering of the rows in this RowGroup. + * The sorting columns can be a subset of all the columns. + */ + 4: optional list<SortingColumn> sorting_columns +} + +/** + * Description for file metadata + */ +struct FileMetaData { + /** Version of this file **/ + 1: required i32 version + + /** Parquet schema for this file. This schema contains metadata for all the columns. + * The schema is represented as a tree with a single root. The nodes of the tree + * are flattened to a list by doing a depth-first traversal. + * The column metadata contains the path in the schema for that column which can be + * used to map columns to nodes in the schema. + * The first element is the root **/ + 2: required list<SchemaElement> schema; + + /** Number of rows in this file **/ + 3: required i64 num_rows + + /** Row groups in this file **/ + 4: required list<RowGroup> row_groups + + /** Optional key/value metadata **/ + 5: optional list<KeyValue> key_value_metadata + + /** String for application that wrote this file. This should be in the format + * <Application> version <App Version> (build <App Build Hash>). + * e.g. 
impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) + **/ + 6: optional string created_by +} + http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/schema-internal.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema-internal.h b/src/parquet/schema-internal.h index 5ceaa0c..53472ab 100644 --- a/src/parquet/schema-internal.h +++ b/src/parquet/schema-internal.h @@ -25,8 +25,8 @@ #include <memory> #include <vector> +#include "parquet/parquet_types.h" #include "parquet/schema.h" -#include "parquet/thrift/parquet_types.h" #include "parquet/types.h" #include "parquet/util/macros.h" #include "parquet/util/visibility.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/schema-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema-test.cc b/src/parquet/schema-test.cc index b092b38..b894cd0 100644 --- a/src/parquet/schema-test.cc +++ b/src/parquet/schema-test.cc @@ -24,9 +24,9 @@ #include <vector> #include "parquet/exception.h" +#include "parquet/parquet_types.h" #include "parquet/schema-internal.h" #include "parquet/schema.h" -#include "parquet/thrift/parquet_types.h" #include "parquet/types.h" using std::string; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/schema.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema.cc b/src/parquet/schema.cc index 13fca68..e6380ae 100644 --- a/src/parquet/schema.cc +++ b/src/parquet/schema.cc @@ -22,8 +22,8 @@ #include <memory> #include "parquet/exception.h" -#include "parquet/thrift/parquet_types.h" -#include "parquet/thrift/util.h" +#include "parquet/parquet_types.h" +#include "parquet/thrift.h" using parquet::format::SchemaElement; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/thrift.h 
---------------------------------------------------------------------- diff --git a/src/parquet/thrift.h b/src/parquet/thrift.h new file mode 100644 index 0000000..7fa0de3 --- /dev/null +++ b/src/parquet/thrift.h @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_THRIFT_UTIL_H +#define PARQUET_THRIFT_UTIL_H + +#include <cstdint> + +// Needed for thrift +#include <boost/shared_ptr.hpp> + +// TCompactProtocol requires some #defines to work right. 
+#define SIGNED_RIGHT_SHIFT_IS 1 +#define ARITHMETIC_RIGHT_SHIFT 1 +#include <thrift/TApplicationException.h> +#include <thrift/protocol/TCompactProtocol.h> +#include <thrift/protocol/TDebugProtocol.h> + +#include <sstream> +#include <thrift/protocol/TBinaryProtocol.h> +#include <thrift/transport/TBufferTransports.h> + +#include "parquet/exception.h" +#include "parquet/parquet_types.h" +#include "parquet/util/logging.h" +#include "parquet/util/memory.h" + +namespace parquet { + +// ---------------------------------------------------------------------- +// Convert Thrift enums to / from parquet enums + +static inline Type::type FromThrift(format::Type::type type) { + return static_cast<Type::type>(type); +} + +static inline LogicalType::type FromThrift(format::ConvertedType::type type) { + // item 0 is NONE + return static_cast<LogicalType::type>(static_cast<int>(type) + 1); +} + +static inline Repetition::type FromThrift(format::FieldRepetitionType::type type) { + return static_cast<Repetition::type>(type); +} + +static inline Encoding::type FromThrift(format::Encoding::type type) { + return static_cast<Encoding::type>(type); +} + +static inline Compression::type FromThrift(format::CompressionCodec::type type) { + return static_cast<Compression::type>(type); +} + +static inline format::Type::type ToThrift(Type::type type) { + return static_cast<format::Type::type>(type); +} + +static inline format::ConvertedType::type ToThrift(LogicalType::type type) { + // item 0 is NONE + DCHECK_NE(type, LogicalType::NONE); + return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1); +} + +static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) { + return static_cast<format::FieldRepetitionType::type>(type); +} + +static inline format::Encoding::type ToThrift(Encoding::type type) { + return static_cast<format::Encoding::type>(type); +} + +static inline format::CompressionCodec::type ToThrift(Compression::type type) { + return 
static_cast<format::CompressionCodec::type>(type); +} + +// ---------------------------------------------------------------------- +// Thrift struct serialization / deserialization utilities + +// Deserialize a thrift message from buf/len. buf/len must at least contain +// all the bytes needed to store the thrift message. On return, len will be +// set to the actual length of the header. +template <class T> +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { + // Deserialize msg bytes into c++ thrift msg using memory transport. + boost::shared_ptr<apache::thrift::transport::TMemoryBuffer> tmem_transport( + new apache::thrift::transport::TMemoryBuffer(const_cast<uint8_t*>(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + boost::shared_ptr<apache::thrift::protocol::TProtocol> tproto = + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; +} + +// Serialize obj into a buffer. The serialized bytes are written to out and the number of bytes written is returned. 
+// The arguments are the object to be serialized and +// the expected size of the serialized object +template <class T> +inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { + boost::shared_ptr<apache::thrift::transport::TMemoryBuffer> mem_buffer( + new apache::thrift::transport::TMemoryBuffer(len)); + apache::thrift::protocol::TCompactProtocolFactoryT< + apache::thrift::transport::TMemoryBuffer> + tproto_factory; + boost::shared_ptr<apache::thrift::protocol::TProtocol> tproto = + tproto_factory.getProtocol(mem_buffer); + try { + mem_buffer->resetBuffer(); + obj->write(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't serialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + + uint8_t* out_buffer; + uint32_t out_length; + mem_buffer->getBuffer(&out_buffer, &out_length); + out->Write(out_buffer, out_length); + return out_length; +} + +} // namespace parquet + +#endif // PARQUET_THRIFT_UTIL_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/thrift/.gitignore ---------------------------------------------------------------------- diff --git a/src/parquet/thrift/.gitignore b/src/parquet/thrift/.gitignore deleted file mode 100644 index 0695270..0000000 --- a/src/parquet/thrift/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -parquet_constants.* -parquet_types.* \ No newline at end of file http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/thrift/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/thrift/CMakeLists.txt b/src/parquet/thrift/CMakeLists.txt deleted file mode 100644 index 78d9cd7..0000000 --- a/src/parquet/thrift/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set(THRIFT_SRCS - parquet_constants.cpp - parquet_types.cpp) - -set_source_files_properties(parquet_types.cpp PROPERTIES - COMPILE_FLAGS -Wno-unused-variable) - -add_library(parquet_thrift STATIC - ${THRIFT_SRCS} -) - -set_target_properties(parquet_thrift - PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") -set_source_files_properties(${THRIFT_SRCS} PROPERTIES GENERATED TRUE) - -# List of thrift output targets -set(OUTPUT_DIR ${CMAKE_SOURCE_DIR}/src/parquet/thrift) -set(THRIFT_OUTPUT_FILES "${OUTPUT_DIR}/parquet_types.cpp") -set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${OUTPUT_DIR}/parquet_types.h") -set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${OUTPUT_DIR}/parquet_constants.cpp") -set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${OUTPUT_DIR}/parquet_constants.h") - -get_filename_component(ABS_PARQUET_THRIFT parquet.thrift ABSOLUTE) - -add_custom_command( - OUTPUT ${THRIFT_OUTPUT_FILES} - COMMAND ${THRIFT_COMPILER} --gen cpp -out ${OUTPUT_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/parquet.thrift - DEPENDS ${ABS_PARQUET_THRIFT} thriftstatic - COMMENT "Running thrift compiler on parquet.thrift" - VERBATIM -) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/thrift/parquet.thrift 
---------------------------------------------------------------------- diff --git a/src/parquet/thrift/parquet.thrift b/src/parquet/thrift/parquet.thrift deleted file mode 100644 index b61c084..0000000 --- a/src/parquet/thrift/parquet.thrift +++ /dev/null @@ -1,574 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/** - * File format description for the parquet file format - */ -namespace cpp parquet.format -namespace java parquet.format - -/** - * Types supported by Parquet. These types are intended to be used in combination - * with the encodings to control the on disk storage format. - * For example INT16 is not included as a type since a good encoding of INT32 - * would handle this. - */ -enum Type { - BOOLEAN = 0; - INT32 = 1; - INT64 = 2; - INT96 = 3; - FLOAT = 4; - DOUBLE = 5; - BYTE_ARRAY = 6; - FIXED_LEN_BYTE_ARRAY = 7; -} - -/** - * Common types used by frameworks(e.g. hive, pig) using parquet. This helps map - * between types in those frameworks to the base types in parquet. This is only - * metadata and not needed to read or write the data. 
- */ -enum ConvertedType { - /** a BYTE_ARRAY actually contains UTF8 encoded chars */ - UTF8 = 0; - - /** a map is converted as an optional field containing a repeated key/value pair */ - MAP = 1; - - /** a key/value pair is converted into a group of two fields */ - MAP_KEY_VALUE = 2; - - /** a list is converted into an optional field containing a repeated field for its - * values */ - LIST = 3; - - /** an enum is converted into a binary field */ - ENUM = 4; - - /** - * A decimal value. - * - * This may be used to annotate binary or fixed primitive types. The - * underlying byte array stores the unscaled value encoded as two's - * complement using big-endian byte order (the most significant byte is the - * zeroth element). The value of the decimal is the value * 10^{-scale}. - * - * This must be accompanied by a (maximum) precision and a scale in the - * SchemaElement. The precision specifies the number of digits in the decimal - * and the scale stores the location of the decimal point. For example 1.23 - * would have precision 3 (3 total digits) and scale 2 (the decimal point is - * 2 digits over). - */ - DECIMAL = 5; - - /** - * A Date - * - * Stored as days since Unix epoch, encoded as the INT32 physical type. - * - */ - DATE = 6; - - /** - * A time - * - * The total number of milliseconds since midnight. The value is stored - * as an INT32 physical type. - */ - TIME_MILLIS = 7; - - /** - * A time. - * - * The total number of microseconds since midnight. The value is stored as - * an INT64 physical type. - */ - TIME_MICROS = 8; - - /** - * A date/time combination - * - * Date and time recorded as milliseconds since the Unix epoch. Recorded as - * a physical type of INT64. - */ - TIMESTAMP_MILLIS = 9; - - /** - * A date/time combination - * - * Date and time recorded as microseconds since the Unix epoch. The value is - * stored as an INT64 physical type. - */ - TIMESTAMP_MICROS = 10; - - - /** - * An unsigned integer value. 
- * - * The number describes the maximum number of meaningful data bits in - * the stored value. 8, 16 and 32 bit values are stored using the - * INT32 physical type. 64 bit values are stored using the INT64 - * physical type. - * - */ - UINT_8 = 11; - UINT_16 = 12; - UINT_32 = 13; - UINT_64 = 14; - - /** - * A signed integer value. - * - * The number describes the maximum number of meaningful data bits in - * the stored value. 8, 16 and 32 bit values are stored using the - * INT32 physical type. 64 bit values are stored using the INT64 - * physical type. - * - */ - INT_8 = 15; - INT_16 = 16; - INT_32 = 17; - INT_64 = 18; - - /** - * An embedded JSON document - * - * A JSON document embedded within a single UTF8 column. - */ - JSON = 19; - - /** - * An embedded BSON document - * - * A BSON document embedded within a single BINARY column. - */ - BSON = 20; - - /** - * An interval of time - * - * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 - * This data is composed of three separate little endian unsigned - * integers. Each stores a component of a duration of time. The first - * integer identifies the number of months associated with the duration, - * the second identifies the number of days associated with the duration - * and the third identifies the number of milliseconds associated with - * the provided duration. This duration of time is independent of any - * particular timezone or date. - */ - INTERVAL = 21; - -} - -/** - * Representation of Schemas - */ -enum FieldRepetitionType { - /** This field is required (can not be null) and each record has exactly 1 value. */ - REQUIRED = 0; - - /** The field is optional (can be null) and each record has 0 or 1 values. */ - OPTIONAL = 1; - - /** The field is repeated and can contain 0 or more values */ - REPEATED = 2; -} - -/** - * Statistics per row group and per page - * All fields are optional.
- */ -struct Statistics { - /** min and max value of the column, encoded in PLAIN encoding */ - 1: optional binary max; - 2: optional binary min; - /** count of null value in the column */ - 3: optional i64 null_count; - /** count of distinct values occurring */ - 4: optional i64 distinct_count; -} - -/** - * Represents an element inside a schema definition. - * - if it is a group (inner node) then type is undefined and num_children is defined - * - if it is a primitive type (leaf) then type is defined and num_children is undefined - * the nodes are listed in depth first traversal order. - */ -struct SchemaElement { - /** Data type for this field. Not set if the current element is a non-leaf node */ - 1: optional Type type; - - /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. - * Otherwise, if specified, this is the maximum bit length to store any of the values. - * (e.g. a low cardinality INT col could have this set to 3). Note that this is - * in the schema, and therefore fixed for the entire file. - */ - 2: optional i32 type_length; - - /** repetition of the field. The root of the schema does not have a repetition_type. - * All other nodes must have one */ - 3: optional FieldRepetitionType repetition_type; - - /** Name of the field in the schema */ - 4: required string name; - - /** Nested fields. Since thrift does not support nested fields, - * the nesting is flattened to a single list by a depth-first traversal. - * The children count is used to construct the nested relationship. - * This field is not set when the element is a primitive type - */ - 5: optional i32 num_children; - - /** When the schema is the result of a conversion from another model - * Used to record the original type to help with cross conversion. - */ - 6: optional ConvertedType converted_type; - - /** Used when this column contains decimal data. - * See the DECIMAL converted type for more details.
- */ - 7: optional i32 scale - 8: optional i32 precision - - /** When the original schema supports field ids, this will save the - * original field id in the parquet schema - */ - 9: optional i32 field_id; - -} - -/** - * Encodings supported by Parquet. Not all encodings are valid for all types. These - * enums are also used to specify the encoding of definition and repetition levels. - * See the accompanying doc for the details of the more complicated encodings. - */ -enum Encoding { - /** Default encoding. - * BOOLEAN - 1 bit per value. 0 is false; 1 is true. - * INT32 - 4 bytes per value. Stored as little-endian. - * INT64 - 8 bytes per value. Stored as little-endian. - * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. - * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. - * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. - * FIXED_LEN_BYTE_ARRAY - Just the bytes. - */ - PLAIN = 0; - - /** Group VarInt encoding for INT32/INT64. - * This encoding is deprecated. It was never used - */ - // GROUP_VAR_INT = 1; - - /** - * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the - * plain type. - * in a data page use RLE_DICTIONARY instead. - * in a Dictionary page use PLAIN instead - */ - PLAIN_DICTIONARY = 2; - - /** Group packed run length encoding. Usable for definition/repetition levels - * encoding and Booleans (on one bit: 0 is false; 1 is true.) - */ - RLE = 3; - - /** Bit packed encoding. This can only be used if the data has a known max - * width. Usable for definition/repetition levels encoding. - */ - BIT_PACKED = 4; - - /** Delta encoding for integers. This can be used for int columns and works best - * on sorted data - */ - DELTA_BINARY_PACKED = 5; - - /** Encoding for byte arrays to separate the length values and the data. The lengths - * are encoded using DELTA_BINARY_PACKED - */ - DELTA_LENGTH_BYTE_ARRAY = 6; - - /** Incremental-encoded byte array.
Prefix lengths are encoded using DELTA_BINARY_PACKED. - * Suffixes are stored as delta length byte arrays. - */ - DELTA_BYTE_ARRAY = 7; - - /** Dictionary encoding: the ids are encoded using the RLE encoding - */ - RLE_DICTIONARY = 8; -} - -/** - * Supported compression algorithms. - */ -enum CompressionCodec { - UNCOMPRESSED = 0; - SNAPPY = 1; - GZIP = 2; - LZO = 3; - BROTLI = 4; -} - -enum PageType { - DATA_PAGE = 0; - INDEX_PAGE = 1; - DICTIONARY_PAGE = 2; - DATA_PAGE_V2 = 3; -} - -/** Data page header */ -struct DataPageHeader { - /** Number of values, including NULLs, in this data page. **/ - 1: required i32 num_values - - /** Encoding used for this data page **/ - 2: required Encoding encoding - - /** Encoding used for definition levels **/ - 3: required Encoding definition_level_encoding; - - /** Encoding used for repetition levels **/ - 4: required Encoding repetition_level_encoding; - - /** Optional statistics for the data in this page**/ - 5: optional Statistics statistics; -} - -struct IndexPageHeader { - /** TODO: **/ -} - -struct DictionaryPageHeader { - /** Number of values in the dictionary **/ - 1: required i32 num_values; - - /** Encoding using this dictionary page **/ - 2: required Encoding encoding - - /** If true, the entries in the dictionary are sorted in ascending order **/ - 3: optional bool is_sorted; -} - -/** - * New page format allowing reading levels without decompressing the data - * Repetition and definition levels are uncompressed - * The remaining section containing the data is compressed if is_compressed is true - **/ -struct DataPageHeaderV2 { - /** Number of values, including NULLs, in this data page. **/ - 1: required i32 num_values - /** Number of NULL values, in this data page. - Number of non-null = num_values - num_nulls which is also the number of values in the data section **/ - 2: required i32 num_nulls - /** Number of rows in this data page.
which means pages change on record boundaries (r = 0) **/ - 3: required i32 num_rows - /** Encoding used for data in this page **/ - 4: required Encoding encoding - - // repetition levels and definition levels are always using RLE (without size in it) - - /** length of the definition levels */ - 5: required i32 definition_levels_byte_length; - /** length of the repetition levels */ - 6: required i32 repetition_levels_byte_length; - - /** whether the values are compressed. - Which means the section of the page between - definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) - is compressed with the compression_codec. - If missing it is considered compressed */ - 7: optional bool is_compressed = 1; - - /** optional statistics for this column chunk */ - 8: optional Statistics statistics; -} - -struct PageHeader { - /** the type of the page: indicates which of the *_header fields is set **/ - 1: required PageType type - - /** Uncompressed page size in bytes (not including this header) **/ - 2: required i32 uncompressed_page_size - - /** Compressed page size in bytes (not including this header) **/ - 3: required i32 compressed_page_size - - /** 32bit crc for the data below. This allows for disabling checksumming in HDFS - * if only a few pages need to be read - **/ - 4: optional i32 crc - - // Headers for page specific data. One only will be set. - 5: optional DataPageHeader data_page_header; - 6: optional IndexPageHeader index_page_header; - 7: optional DictionaryPageHeader dictionary_page_header; - 8: optional DataPageHeaderV2 data_page_header_v2; -} - -/** - * Wrapper struct to store key values - */ - struct KeyValue { - 1: required string key - 2: optional string value -} - -/** - * Wrapper struct to specify sort order - */ -struct SortingColumn { - /** The column index (in this row group) **/ - 1: required i32 column_idx - - /** If true, indicates this column is sorted in descending order.
**/ - 2: required bool descending - - /** If true, nulls will come before non-null values, otherwise, - * nulls go at the end. */ - 3: required bool nulls_first -} - -/** - * statistics of a given page type and encoding - */ -struct PageEncodingStats { - - /** the page type (data/dic/...) **/ - 1: required PageType page_type; - - /** encoding of the page **/ - 2: required Encoding encoding; - - /** number of pages of this type with this encoding **/ - 3: required i32 count; - -} - -/** - * Description for column metadata - */ -struct ColumnMetaData { - /** Type of this column **/ - 1: required Type type - - /** Set of all encodings used for this column. The purpose is to validate - * whether we can decode those pages. **/ - 2: required list<Encoding> encodings - - /** Path in schema **/ - 3: required list<string> path_in_schema - - /** Compression codec **/ - 4: required CompressionCodec codec - - /** Number of values in this column **/ - 5: required i64 num_values - - /** total byte size of all uncompressed pages in this column chunk (including the headers) **/ - 6: required i64 total_uncompressed_size - - /** total byte size of all compressed pages in this column chunk (including the headers) **/ - 7: required i64 total_compressed_size - - /** Optional key/value metadata **/ - 8: optional list<KeyValue> key_value_metadata - - /** Byte offset from beginning of file to first data page **/ - 9: required i64 data_page_offset - - /** Byte offset from beginning of file to root index page **/ - 10: optional i64 index_page_offset - - /** Byte offset from the beginning of file to first (only) dictionary page **/ - 11: optional i64 dictionary_page_offset - - /** optional statistics for this column chunk */ - 12: optional Statistics statistics; - - /** Set of all encodings used for pages in this column chunk. 
- * This information can be used to determine if all data pages are - * dictionary encoded for example **/ - 13: optional list<PageEncodingStats> encoding_stats; -} - -struct ColumnChunk { - /** File where column data is stored. If not set, assumed to be same file as - * metadata. This path is relative to the current file. - **/ - 1: optional string file_path - - /** Byte offset in file_path to the ColumnMetaData **/ - 2: required i64 file_offset - - /** Column metadata for this chunk. This is the same content as what is at - * file_path/file_offset. Having it here has it replicated in the file - * metadata. - **/ - 3: optional ColumnMetaData meta_data -} - -struct RowGroup { - /** Metadata for each column chunk in this row group. - * This list must have the same order as the SchemaElement list in FileMetaData. - **/ - 1: required list<ColumnChunk> columns - - /** Total byte size of all the uncompressed column data in this row group **/ - 2: required i64 total_byte_size - - /** Number of rows in this row group **/ - 3: required i64 num_rows - - /** If set, specifies a sort ordering of the rows in this RowGroup. - * The sorting columns can be a subset of all the columns. - */ - 4: optional list<SortingColumn> sorting_columns -} - -/** - * Description for file metadata - */ -struct FileMetaData { - /** Version of this file **/ - 1: required i32 version - - /** Parquet schema for this file. This schema contains metadata for all the columns. - * The schema is represented as a tree with a single root. The nodes of the tree - * are flattened to a list by doing a depth-first traversal. - * The column metadata contains the path in the schema for that column which can be - * used to map columns to nodes in the schema. 
- * The first element is the root **/ - 2: required list<SchemaElement> schema; - - /** Number of rows in this file **/ - 3: required i64 num_rows - - /** Row groups in this file **/ - 4: required list<RowGroup> row_groups - - /** Optional key/value metadata **/ - 5: optional list<KeyValue> key_value_metadata - - /** String for application that wrote this file. This should be in the format - * <Application> version <App Version> (build <App Build Hash>). - * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) - **/ - 6: optional string created_by -} - http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b1c85caf/src/parquet/thrift/util.h ---------------------------------------------------------------------- diff --git a/src/parquet/thrift/util.h b/src/parquet/thrift/util.h deleted file mode 100644 index 30d7edf..0000000 --- a/src/parquet/thrift/util.h +++ /dev/null @@ -1,147 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_THRIFT_UTIL_H -#define PARQUET_THRIFT_UTIL_H - -#include <cstdint> - -// Needed for thrift -#include <boost/shared_ptr.hpp> - -// TCompactProtocol requires some #defines to work right. 
-#define SIGNED_RIGHT_SHIFT_IS 1 -#define ARITHMETIC_RIGHT_SHIFT 1 -#include <thrift/TApplicationException.h> -#include <thrift/protocol/TCompactProtocol.h> -#include <thrift/protocol/TDebugProtocol.h> - -#include <sstream> -#include <thrift/protocol/TBinaryProtocol.h> -#include <thrift/transport/TBufferTransports.h> - -#include "parquet/exception.h" -#include "parquet/thrift/parquet_types.h" -#include "parquet/util/logging.h" -#include "parquet/util/memory.h" - -namespace parquet { - -// ---------------------------------------------------------------------- -// Convert Thrift enums to / from parquet enums - -static inline Type::type FromThrift(format::Type::type type) { - return static_cast<Type::type>(type); -} - -static inline LogicalType::type FromThrift(format::ConvertedType::type type) { - // item 0 is NONE - return static_cast<LogicalType::type>(static_cast<int>(type) + 1); -} - -static inline Repetition::type FromThrift(format::FieldRepetitionType::type type) { - return static_cast<Repetition::type>(type); -} - -static inline Encoding::type FromThrift(format::Encoding::type type) { - return static_cast<Encoding::type>(type); -} - -static inline Compression::type FromThrift(format::CompressionCodec::type type) { - return static_cast<Compression::type>(type); -} - -static inline format::Type::type ToThrift(Type::type type) { - return static_cast<format::Type::type>(type); -} - -static inline format::ConvertedType::type ToThrift(LogicalType::type type) { - // item 0 is NONE - DCHECK_NE(type, LogicalType::NONE); - return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1); -} - -static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) { - return static_cast<format::FieldRepetitionType::type>(type); -} - -static inline format::Encoding::type ToThrift(Encoding::type type) { - return static_cast<format::Encoding::type>(type); -} - -static inline format::CompressionCodec::type ToThrift(Compression::type type) { - return 
static_cast<format::CompressionCodec::type>(type); -} - -// ---------------------------------------------------------------------- -// Thrift struct serialization / deserialization utilities - -// Deserialize a thrift message from buf/len. buf/len must at least contain -// all the bytes needed to store the thrift message. On return, len will be -// set to the actual length of the header. -template <class T> -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { - // Deserialize msg bytes into c++ thrift msg using memory transport. - boost::shared_ptr<apache::thrift::transport::TMemoryBuffer> tmem_transport( - new apache::thrift::transport::TMemoryBuffer(const_cast<uint8_t*>(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - boost::shared_ptr<apache::thrift::protocol::TProtocol> tproto = - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; -} - -// Serialize obj into a buffer. The result is returned as a string. 
-// The arguments are the object to be serialized and -// the expected size of the serialized object -template <class T> -inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { - boost::shared_ptr<apache::thrift::transport::TMemoryBuffer> mem_buffer( - new apache::thrift::transport::TMemoryBuffer(len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - boost::shared_ptr<apache::thrift::protocol::TProtocol> tproto = - tproto_factory.getProtocol(mem_buffer); - try { - mem_buffer->resetBuffer(); - obj->write(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't serialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - - uint8_t* out_buffer; - uint32_t out_length; - mem_buffer->getBuffer(&out_buffer, &out_length); - out->Write(out_buffer, out_length); - return out_length; -} - -} // namespace parquet - -#endif // PARQUET_THRIFT_UTIL_H
