This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 54e28b0b9 ORC-1546: [C++] Use `orc-format` 1.0.0-alpha and remove `proto` module
54e28b0b9 is described below

commit 54e28b0b9a83f79932c00427fb0203af257d7d71
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Wed Dec 6 13:13:47 2023 -0800

    ORC-1546: [C++] Use `orc-format` 1.0.0-alpha and remove `proto` module
    
    ### What changes were proposed in this pull request?
    
    This PR aims to use `orc-format` 1.0.0-alpha and remove `proto` module completely.
    - https://github.com/apache/orc-format/releases/tag/v1.0.0-alpha
    
    ### Why are the changes needed?
    
    To verify that the C++ module uses `orc-format` successfully.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    Closes #1684 from dongjoon-hyun/ORC-1546.
    
    Authored-by: Dongjoon Hyun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 c++/src/CMakeLists.txt                  |   4 +-
 cmake_modules/ThirdpartyToolchain.cmake |  12 +
 proto/orc_proto.proto                   | 455 --------------------------------
 3 files changed, 14 insertions(+), 457 deletions(-)

diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 972f2fc03..63c2043af 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -146,9 +146,9 @@ include_directories (
 
 add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc
    COMMAND ${PROTOBUF_EXECUTABLE}
-        -I ${PROJECT_SOURCE_DIR}/proto
+        -I ../../orc-format_ep-prefix/src/orc-format_ep/src/main/proto
         --cpp_out="${CMAKE_CURRENT_BINARY_DIR}"
-        "${PROJECT_SOURCE_DIR}/proto/orc_proto.proto"
+        ../../orc-format_ep-prefix/src/orc-format_ep/src/main/proto/orc_proto.proto
 )
 
 set(SOURCE_FILES
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 47c1a65ba..d92c926eb 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+set(ORC_FORMAT_VERSION "1.0.0-alpha")
 set(LZ4_VERSION "1.9.3")
 set(SNAPPY_VERSION "1.1.7")
 set(ZLIB_VERSION "1.2.11")
@@ -68,6 +69,17 @@ if (DEFINED ENV{GTEST_HOME})
   set (GTEST_HOME "$ENV{GTEST_HOME}")
 endif ()
 
+# ----------------------------------------------------------------------
+# ORC Format
+ExternalProject_Add (orc-format_ep
+  URL "https://github.com/apache/orc-format/archive/refs/tags/v${ORC_FORMAT_VERSION}.tar.gz"
+  URL_HASH SHA256=d04e878feec01dd9a3ce20553c0bfc70e856a319fe8693725a699bb077d0d286
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND     ""
+  TEST_COMMAND     ""
+)
+
 # ----------------------------------------------------------------------
 # Snappy
 
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
deleted file mode 100644
index 45d7d2a05..000000000
--- a/proto/orc_proto.proto
+++ /dev/null
@@ -1,455 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-syntax = "proto2";
-
-package orc.proto;
-
-option java_package = "org.apache.orc";
-
-message IntegerStatistics  {
-  optional sint64 minimum = 1;
-  optional sint64 maximum = 2;
-  optional sint64 sum = 3;
-}
-
-message DoubleStatistics {
-  optional double minimum = 1;
-  optional double maximum = 2;
-  optional double sum = 3;
-}
-
-message StringStatistics {
-  optional string minimum = 1;
-  optional string maximum = 2;
-  // sum will store the total length of all strings in a stripe
-  optional sint64 sum = 3;
-  // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper
-  // bound instead of the minimum or maximum values above.
-  optional string lowerBound = 4;
-  optional string upperBound = 5;
-}
-
-message BucketStatistics {
-  repeated uint64 count = 1 [packed=true];
-}
-
-message DecimalStatistics {
-  optional string minimum = 1;
-  optional string maximum = 2;
-  optional string sum = 3;
-}
-
-message DateStatistics {
-  // min,max values saved as days since epoch
-  optional sint32 minimum = 1;
-  optional sint32 maximum = 2;
-}
-
-message TimestampStatistics {
-  // min,max values saved as milliseconds since epoch
-  optional sint64 minimum = 1;
-  optional sint64 maximum = 2;
-  optional sint64 minimumUtc = 3;
-  optional sint64 maximumUtc = 4;
-  // store the lower 6 TS digits for min/max to achieve nanosecond precision
-  optional int32 minimumNanos = 5;
-  optional int32 maximumNanos = 6;
-}
-
-message BinaryStatistics {
-  // sum will store the total binary blob length in a stripe
-  optional sint64 sum = 1;
-}
-
-// Statistics for list and map
-message CollectionStatistics {
-  optional uint64 minChildren = 1;
-  optional uint64 maxChildren = 2;
-  optional uint64 totalChildren = 3;
-}
-
-message ColumnStatistics {
-  optional uint64 numberOfValues = 1;
-  optional IntegerStatistics intStatistics = 2;
-  optional DoubleStatistics doubleStatistics = 3;
-  optional StringStatistics stringStatistics = 4;
-  optional BucketStatistics bucketStatistics = 5;
-  optional DecimalStatistics decimalStatistics = 6;
-  optional DateStatistics dateStatistics = 7;
-  optional BinaryStatistics binaryStatistics = 8;
-  optional TimestampStatistics timestampStatistics = 9;
-  optional bool hasNull = 10;
-  optional uint64 bytesOnDisk = 11;
-  optional CollectionStatistics collectionStatistics = 12;
-}
-
-message RowIndexEntry {
-  repeated uint64 positions = 1 [packed=true];
-  optional ColumnStatistics statistics = 2;
-}
-
-message RowIndex {
-  repeated RowIndexEntry entry = 1;
-}
-
-message BloomFilter {
-  optional uint32 numHashFunctions = 1;
-  repeated fixed64 bitset = 2;
-  optional bytes utf8bitset = 3;
-}
-
-message BloomFilterIndex {
-  repeated BloomFilter bloomFilter = 1;
-}
-
-message Stream {
-  // if you add new index stream kinds, you need to make sure to update
-  // StreamName to ensure it is added to the stripe in the right area
-  enum Kind {
-    PRESENT = 0;
-    DATA = 1;
-    LENGTH = 2;
-    DICTIONARY_DATA = 3;
-    DICTIONARY_COUNT = 4;
-    SECONDARY = 5;
-    ROW_INDEX = 6;
-    BLOOM_FILTER = 7;
-    BLOOM_FILTER_UTF8 = 8;
-    // Virtual stream kinds to allocate space for encrypted index and data.
-    ENCRYPTED_INDEX = 9;
-    ENCRYPTED_DATA = 10;
-
-    // stripe statistics streams
-    STRIPE_STATISTICS = 100;
-    // A virtual stream kind that is used for setting the encryption IV.
-    FILE_STATISTICS = 101;
-  }
-  optional Kind kind = 1;
-  optional uint32 column = 2;
-  optional uint64 length = 3;
-}
-
-message ColumnEncoding {
-  enum Kind {
-    DIRECT = 0;
-    DICTIONARY = 1;
-    DIRECT_V2 = 2;
-    DICTIONARY_V2 = 3;
-  }
-  optional Kind kind = 1;
-  optional uint32 dictionarySize = 2;
-
-  // The encoding of the bloom filters for this column:
-  //   0 or missing = none or original
-  //   1            = ORC-135 (utc for timestamps)
-  optional uint32 bloomEncoding = 3;
-}
-
-message StripeEncryptionVariant {
-  repeated Stream streams = 1;
-  repeated ColumnEncoding encoding = 2;
-}
-
-// each stripe looks like:
-//   index streams
-//     unencrypted
-//     variant 1..N
-//   data streams
-//     unencrypted
-//     variant 1..N
-//   footer
-
-message StripeFooter {
-  repeated Stream streams = 1;
-  repeated ColumnEncoding columns = 2;
-  optional string writerTimezone = 3;
-  // one for each column encryption variant
-  repeated StripeEncryptionVariant encryption = 4;
-}
-
-// the file tail looks like:
-//   encrypted stripe statistics: ColumnarStripeStatistics (order by variant)
-//   stripe statistics: Metadata
-//   footer: Footer
-//   postscript: PostScript
-//   psLen: byte
-
-message StringPair {
-  optional string key = 1;
-  optional string value = 2;
-}
-
-message Type {
-  enum Kind {
-    BOOLEAN = 0;
-    BYTE = 1;
-    SHORT = 2;
-    INT = 3;
-    LONG = 4;
-    FLOAT = 5;
-    DOUBLE = 6;
-    STRING = 7;
-    BINARY = 8;
-    TIMESTAMP = 9;
-    LIST = 10;
-    MAP = 11;
-    STRUCT = 12;
-    UNION = 13;
-    DECIMAL = 14;
-    DATE = 15;
-    VARCHAR = 16;
-    CHAR = 17;
-    TIMESTAMP_INSTANT = 18;
-  }
-  optional Kind kind = 1;
-  repeated uint32 subtypes = 2 [packed=true];
-  repeated string fieldNames = 3;
-  optional uint32 maximumLength = 4;
-  optional uint32 precision = 5;
-  optional uint32 scale = 6;
-  repeated StringPair attributes = 7;
-}
-
-message StripeInformation {
-  // the global file offset of the start of the stripe
-  optional uint64 offset = 1;
-  // the number of bytes of index
-  optional uint64 indexLength = 2;
-  // the number of bytes of data
-  optional uint64 dataLength = 3;
-  // the number of bytes in the stripe footer
-  optional uint64 footerLength = 4;
-  // the number of rows in this stripe
-  optional uint64 numberOfRows = 5;
-  // If this is present, the reader should use this value for the encryption
-  // stripe id for setting the encryption IV. Otherwise, the reader should
-  // use one larger than the previous stripe's encryptStripeId.
-  // For unmerged ORC files, the first stripe will use 1 and the rest of the
-  // stripes won't have it set. For merged files, the stripe information
-  // will be copied from their original files and thus the first stripe of
-  // each of the input files will reset it to 1.
-  // Note that 1 was choosen, because protobuf v3 doesn't serialize
-  // primitive types that are the default (eg. 0).
-  optional uint64 encryptStripeId = 6;
-  // For each encryption variant, the new encrypted local key to use
-  // until we find a replacement.
-  repeated bytes encryptedLocalKeys = 7;
-}
-
-message UserMetadataItem {
-  optional string name = 1;
-  optional bytes value = 2;
-}
-
-// StripeStatistics (1 per a stripe), which each contain the
-// ColumnStatistics for each column.
-// This message type is only used in ORC v0 and v1.
-message StripeStatistics {
-  repeated ColumnStatistics colStats = 1;
-}
-
-// This message type is only used in ORC v0 and v1.
-message Metadata {
-  repeated StripeStatistics stripeStats = 1;
-}
-
-// In ORC v2 (and for encrypted columns in v1), each column has
-// their column statistics written separately.
-message ColumnarStripeStatistics {
-  // one value for each stripe in the file
-  repeated ColumnStatistics colStats = 1;
-}
-
-enum EncryptionAlgorithm {
-  UNKNOWN_ENCRYPTION = 0;  // used for detecting future algorithms
-  AES_CTR_128 = 1;
-  AES_CTR_256 = 2;
-}
-
-message FileStatistics {
-  repeated ColumnStatistics column = 1;
-}
-
-// How was the data masked? This isn't necessary for reading the file, but
-// is documentation about how the file was written.
-message DataMask {
-  // the kind of masking, which may include third party masks
-  optional string name = 1;
-  // parameters for the mask
-  repeated string maskParameters = 2;
-  // the unencrypted column roots this mask was applied to
-  repeated uint32 columns = 3 [packed = true];
-}
-
-// Information about the encryption keys.
-message EncryptionKey {
-  optional string keyName = 1;
-  optional uint32 keyVersion = 2;
-  optional EncryptionAlgorithm algorithm = 3;
-}
-
-// The description of an encryption variant.
-// Each variant is a single subtype that is encrypted with a single key.
-message EncryptionVariant {
-  // the column id of the root
-  optional uint32 root = 1;
-  // The master key that was used to encrypt the local key, referenced as
-  // an index into the Encryption.key list.
-  optional uint32 key = 2;
-  // the encrypted key for the file footer
-  optional bytes encryptedKey = 3;
-  // the stripe statistics for this variant
-  repeated Stream stripeStatistics = 4;
-  // encrypted file statistics as a FileStatistics
-  optional bytes fileStatistics = 5;
-}
-
-// Which KeyProvider encrypted the local keys.
-enum KeyProviderKind {
-  UNKNOWN = 0;
-  HADOOP = 1;
-  AWS = 2;
-  GCP = 3;
-  AZURE = 4;
-}
-
-message Encryption {
-  // all of the masks used in this file
-  repeated DataMask mask = 1;
-  // all of the keys used in this file
-  repeated EncryptionKey key = 2;
-  // The encrypted variants.
-  // Readers should prefer the first variant that the user has access to
-  // the corresponding key. If they don't have access to any of the keys,
-  // they should get the unencrypted masked data.
-  repeated EncryptionVariant variants = 3;
-  // How are the local keys encrypted?
-  optional KeyProviderKind keyProvider = 4;
-}
-
-enum CalendarKind {
-  UNKNOWN_CALENDAR = 0;
-   // A hybrid Julian/Gregorian calendar with a cutover point in October 1582.
-  JULIAN_GREGORIAN = 1;
-  // A calendar that extends the Gregorian calendar back forever.
-  PROLEPTIC_GREGORIAN = 2;
-}
-
-message Footer {
-  optional uint64 headerLength = 1;
-  optional uint64 contentLength = 2;
-  repeated StripeInformation stripes = 3;
-  repeated Type types = 4;
-  repeated UserMetadataItem metadata = 5;
-  optional uint64 numberOfRows = 6;
-  repeated ColumnStatistics statistics = 7;
-  optional uint32 rowIndexStride = 8;
-
-  // Each implementation that writes ORC files should register for a code
-  // 0 = ORC Java
-  // 1 = ORC C++
-  // 2 = Presto
-  // 3 = Scritchley Go from https://github.com/scritchley/orc
-  // 4 = Trino
-  // 5 = CUDF
-  optional uint32 writer = 9;
-
-  // information about the encryption in this file
-  optional Encryption encryption = 10;
-  optional CalendarKind calendar = 11;
-
-  // informative description about the version of the software that wrote
-  // the file. It is assumed to be within a given writer, so for example
-  // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT".
-  optional string softwareVersion = 12;
-}
-
-enum CompressionKind {
-  NONE = 0;
-  ZLIB = 1;
-  SNAPPY = 2;
-  LZO = 3;
-  LZ4 = 4;
-  ZSTD = 5;
-}
-
-// Serialized length must be less that 255 bytes
-message PostScript {
-  optional uint64 footerLength = 1;
-  optional CompressionKind compression = 2;
-  optional uint64 compressionBlockSize = 3;
-  // the version of the file format
-  //   [0, 11] = Hive 0.11
-  //   [0, 12] = Hive 0.12
-  repeated uint32 version = 4 [packed = true];
-  optional uint64 metadataLength = 5;
-
-  // The version of the writer that wrote the file. This number is
-  // updated when we make fixes or large changes to the writer so that
-  // readers can detect whether a given bug is present in the data.
-  //
-  // Only the Java ORC writer may use values under 6 (or missing) so that
-  // readers that predate ORC-202 treat the new writers correctly. Each
-  // writer should assign their own sequence of versions starting from 6.
-  //
-  // Version of the ORC Java writer:
-  //   0 = original
-  //   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
-  //                        string statistics use utf8 for min/max)
-  //   2 = HIVE-4243 fixed (use real column names from Hive tables)
-  //   3 = HIVE-12055 added (vectorized writer implementation)
-  //   4 = HIVE-13083 fixed (decimals write present stream correctly)
-  //   5 = ORC-101 fixed (bloom filters use utf8 consistently)
-  //   6 = ORC-135 fixed (timestamp statistics use utc)
-  //   7 = ORC-517 fixed (decimal64 min/max incorrect)
-  //   8 = ORC-203 added (trim very long string statistics)
-  //   9 = ORC-14 added (column encryption)
-  //
-  // Version of the ORC C++ writer:
-  //   6 = original
-  //
-  // Version of the Presto writer:
-  //   6 = original
-  //
-  // Version of the Scritchley Go writer:
-  //   6 = original
-  //
-  // Version of the Trino writer:
-  //   6 = original
-  //
-  // Version of the CUDF writer:
-  //   6 = original
-  //
-  optional uint32 writerVersion = 6;
-
-  // the number of bytes in the encrypted stripe statistics
-  optional uint64 stripeStatisticsLength = 7;
-
-  // Leave this last in the record
-  optional string magic = 8000;
-}
-
-// The contents of the file tail that must be serialized.
-// This gets serialized as part of OrcSplit, also used by footer cache.
-message FileTail {
-  optional PostScript postscript = 1;
-  optional Footer footer = 2;
-  optional uint64 fileLength = 3;
-  optional uint64 postscriptLength = 4;
-}

Reply via email to