This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new c5d7755ba ORC-1361: InvalidProtocolBufferException when reading large stripe statistics
c5d7755ba is described below
commit c5d7755ba0b9a95631c8daea4d094101f26ec761
Author: Stamatis Zampetakis <[email protected]>
AuthorDate: Fri Feb 24 03:10:52 2023 +0100
ORC-1361: InvalidProtocolBufferException when reading large stripe statistics
### What changes were proposed in this pull request?
Catch `InvalidProtocolBufferException` when parsing stripe statistics, log
a warning message, and return an empty list.
### Why are the changes needed?
In some cases ORC files may be created with a very large Metadata section due
to stripe statistics. The bigger the ORC file gets, the easier it is to end up
with a large Metadata section. Nevertheless, it is still possible to hit the
problem with smaller ORC files, and `TestOrcWithLargeStripeStatistics`
demonstrates some extremes.
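For a sense of scale, using the file shape from
`TestOrcWithLargeStripeStatistics`: with 500 string columns holding ~200-byte
values, each one-row stripe contributes roughly 200KB of column statistics
(about 100KB for min, 100KB for max, a few bytes for sum), so on the order of
10,000 such stripes already pushes the Metadata section past the 2GB protobuf
ceiling.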
Any attempt to read back the stripe statistics from the file will fail with
`InvalidProtocolBufferException`. The exact exception may differ slightly
depending on: a) the size of the stripe statistics; b) the protobuf size limit.
Stripe statistics less than 2GB, and protobuf limit less than stats size:
```
com.google.protobuf.InvalidProtocolBufferException: Protocol message was
too large. May be malicious. Use CodedInputStream.setSizeLimit() to increase
the size limit.
```
Stripe statistics greater than 2GB, and protobuf limit 2GB
(`Integer.MAX_VALUE`):
```
com.google.protobuf.InvalidProtocolBufferException: While parsing a
protocol message, the input ended unexpectedly in the middle of a field. This
could mean either that the input has been truncated or that an embedded message
misreported its own length.
```
The `Protocol message was too large` problem could be alleviated by
increasing the limit, as was done in the past. However, increasing the limit
would hide the underlying problem and probably lead to permanent metadata
corruption in the near future. Moreover, the limit is already high enough
(1GB), and bumping it further would lead to more memory being used,
potentially triggering `OutOfMemoryError`, the OOM Killer, GC pauses, and
other problems that are usually harder to debug and find t [...]
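For reference, this is roughly what bumping the limit amounts to. A minimal
sketch, assuming a hypothetical `metadataBytes` array holding a serialized
`OrcProto.Metadata`; it is not what this change does:
```
import com.google.protobuf.CodedInputStream;
import java.io.IOException;
import org.apache.orc.OrcProto;

class SizeLimitSketch {
  // Sketch only: raising the per-stream protobuf limit postpones the
  // failure and costs more memory instead of fixing the root cause.
  static OrcProto.Metadata parseWithRaisedLimit(byte[] metadataBytes)
      throws IOException {
    CodedInputStream in = CodedInputStream.newInstance(metadataBytes);
    in.setSizeLimit(Integer.MAX_VALUE); // ~2GB, protobuf's hard ceiling
    // Parsing still throws InvalidProtocolBufferException once the
    // message itself exceeds 2GB.
    return OrcProto.Metadata.parseFrom(in);
  }
}
```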
When the stripe statistics exceed 2GB there is nothing to be done to parse
the statistics back, since protobuf cannot deserialize such messages
(https://github.com/protocolbuffers/protobuf/issues/11729). The problem cannot
be solved unless the metadata storage changes, but this can only happen in
newer versions.
On the other hand, stripe statistics are important but not vital for
readers. In situations where the limits are breached, it is acceptable to log
a warning and return nothing instead of raising a fatal error to the caller.
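With this change, a hypothetical caller only needs to treat an empty list as
"statistics unavailable" and degrade gracefully, for example:
```
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeStatistics;

class StripeStatsConsumer {
  // Illustrative consumer, not part of the patch: falls back when the
  // reader returns an empty stripe-statistics list.
  static void inspect(Path path) throws IOException {
    Configuration conf = new Configuration();
    try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
      List<StripeStatistics> stats = reader.getStripeStatistics();
      if (stats.isEmpty()) {
        System.out.println("Stripe statistics unavailable; falling back to full scan.");
        return;
      }
      System.out.println("Stripes with statistics: " + stats.size());
    }
  }
}
```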
### How was this patch tested?
Existing unit tests plus new tests added in
`TestOrcWithLargeStripeStatistics`.
This closes #1402
---
.../src/java/org/apache/orc/impl/ReaderImpl.java | 16 ++-
.../orc/TestOrcWithLargeStripeStatistics.java | 115 +++++++++++++++++++++
2 files changed, 126 insertions(+), 5 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 6a4830c38..f2e150cea 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -19,6 +19,7 @@
package org.apache.orc.impl;
import com.google.protobuf.CodedInputStream;
+import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.TextFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -55,6 +56,7 @@ import java.nio.ByteBuffer;
import java.security.Key;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.function.Supplier;
@@ -1035,11 +1037,15 @@ public class ReaderImpl implements Reader {
long offset,
int length,
InStream.StreamOptions options) throws IOException {
- InStream stream = InStream.create("stripe stats", tailBuffer, offset,
- length, options);
- OrcProto.Metadata meta = OrcProto.Metadata.parseFrom(
- InStream.createCodedInputStream(stream));
- return meta.getStripeStatsList();
+ try (InStream stream = InStream.create("stripe stats", tailBuffer, offset,
+ length, options)) {
+ OrcProto.Metadata meta = OrcProto.Metadata.parseFrom(
+ InStream.createCodedInputStream(stream));
+ return meta.getStripeStatsList();
+ } catch (InvalidProtocolBufferException e) {
+ LOG.warn("Failed to parse stripe statistics", e);
+ return Collections.emptyList();
+ }
}
private List<StripeStatistics>
convertFromProto(List<OrcProto.StripeStatistics> list) {
diff --git a/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java
new file mode 100644
index 000000000..a766cb01d
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for operations on Orc file with very large stripe statistics.
+ * <p>
+ * The test is disabled by default because it is rather slow (approx 14 minutes) and memory greedy
+ * (it requires about 4g heap space when creating the files). If you want to run it, remove the
+ * {@code Disabled} annotation and ensure that max heap (Xmx) is at least 4g.
+ * </p>
+ */
+@Disabled("ORC-1361")
+public class TestOrcWithLargeStripeStatistics {
+
+ @ParameterizedTest
+ @EnumSource(value = OrcFile.Version.class, mode = EnumSource.Mode.EXCLUDE, names = "FUTURE")
+ public void testGetStripeStatisticsNoProtocolBufferExceptions(OrcFile.Version version)
+ throws Exception {
+ // Use a size that exceeds the protobuf limit (e.g., 1GB) to trigger protobuf exception
+ Path p = createOrcFile(1024L << 20, version);
+ try (Reader reader = OrcFile.createReader(p, OrcFile.readerOptions(new Configuration()))) {
+ assertTrue(reader.getStripeStatistics().isEmpty());
+ }
+ }
+
+ /**
+ * Creates an Orc file with a metadata section of the specified size and returns its path in the
+ * filesystem.
+ *
+ * The file has a fixed schema (500 string columns) and content (every column contains 200
+ * characters, which is roughly 200 bytes). Each row is roughly 100KB uncompressed and each stripe
+ * holds exactly one row thus stripe metadata (column statistics) per row is 200KB (100KB for min,
+ * 100KB for max, few bytes for sum).
+ *
+ * @param metadataSize the desired size of the resulting metadata section in bytes
+ * @param version the desired version to create the file
+ * @return the path to filesystem where the file was created.
+ * @throws IOException if an IO problem occurs while creating the file
+ */
+ private static Path createOrcFile(long metadataSize, OrcFile.Version version) throws IOException {
+ // Calculate the number of rows/stripes to create based on the size of one row (200KB).
+ final long ROW_STRIPE_NUM = metadataSize / 200_000L;
+ Path p = new Path(System.getProperty("test.tmp.dir"),
+ TestOrcWithLargeStripeStatistics.class.getSimpleName()
+ + "_" + ROW_STRIPE_NUM + "_" + version + ".orc");
+ // Modify defaults to force one row per stripe.
+ Configuration conf = new Configuration();
+ conf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "0");
+ TypeDescription schema = createTypeDescription();
+ OrcFile.WriterOptions writerOptions =
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1)
+ .encodingStrategy(OrcFile.EncodingStrategy.SPEED)
+ .version(version);
+ try (Writer writer = OrcFile.createWriter(p, writerOptions)) {
+ VectorizedRowBatch batch = createSingleRowBatch(schema);
+ for (long i = 0; i < ROW_STRIPE_NUM; i++) {
+ writer.addRowBatch(batch);
+ }
+ }
+ return p;
+ }
+
+ private static VectorizedRowBatch createSingleRowBatch(TypeDescription schema) {
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1;
+ byte[] bigString = new byte[200];
+ Arrays.fill(bigString, (byte) 'A');
+ for (int i = 0; i < batch.numCols; i++) {
+ BytesColumnVector col = (BytesColumnVector) batch.cols[i];
+ col.setVal(0, bigString);
+ }
+ return batch;
+ }
+
+ private static TypeDescription createTypeDescription() {
+ String strCols = IntStream.range(0, 500)
+ .mapToObj(i -> "col" + i + ":string")
+ .collect(Collectors.joining(","));
+ return TypeDescription.fromString("struct<" + strCols + ">");
+ }
+
+}