[ 
https://issues.apache.org/jira/browse/PARQUET-1135?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16393851#comment-16393851
 ] 

ASF GitHub Bot commented on PARQUET-1135:
-----------------------------------------

julienledem closed pull request #427: PARQUET-1135: upgrade thrift and protobuf 
dependencies
URL: https://github.com/apache/parquet-mr/pull/427
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/.travis.yml b/.travis.yml
index 55f6e9afa..ef02f9b9b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,34 +1,37 @@
 language: java
 before_install:
+  - date
   - sudo apt-get update -qq
   - sudo apt-get install build-essential
+  - date
   - mkdir protobuf_install
   - pushd protobuf_install
-  - wget https://github.com/google/protobuf/archive/v3.2.0.tar.gz -O 
protobuf-3.2.0.tar.gz
-  - tar xzf protobuf-3.2.0.tar.gz
-  - cd protobuf-3.2.0
+  - wget https://github.com/google/protobuf/archive/v3.5.1.tar.gz -O 
protobuf-3.5.1.tar.gz
+  - tar xzf protobuf-3.5.1.tar.gz
+  - cd protobuf-3.5.1
   - sudo apt-get install autoconf automake libtool curl make g++ unzip
   - ./autogen.sh
   - ./configure
   - make
-  - make check
   - sudo make install
   - sudo ldconfig
   - protoc --version
   - popd
+  - date
   - pwd
   - sudo apt-get install -qq libboost-dev libboost-test-dev 
libboost-program-options-dev libevent-dev automake libtool flex bison 
pkg-config g++ libssl-dev
-  - wget -nv http://archive.apache.org/dist/thrift/0.7.0/thrift-0.7.0.tar.gz
-  - tar zxf thrift-0.7.0.tar.gz
-  - cd thrift-0.7.0
+  - wget -nv http://archive.apache.org/dist/thrift/0.9.3/thrift-0.9.3.tar.gz
+  - tar zxf thrift-0.9.3.tar.gz
+  - cd thrift-0.9.3
   - chmod +x ./configure
-  - ./configure --disable-gen-erl --disable-gen-hs --without-ruby 
--without-haskell --without-erlang --without-php
+  - ./configure --disable-gen-erl --disable-gen-hs --without-ruby 
--without-haskell --without-erlang --without-php --without-nodejs
   - sudo make install
   - cd ..
+  - date
 
 env:
   - HADOOP_PROFILE=default TEST_CODECS=uncompressed,brotli
   - HADOOP_PROFILE=default TEST_CODECS=gzip,snappy
 
-install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true 
-Dsource.skip=true > mvn_install.log || mvn install --batch-mode 
-DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log 
|| (cat mvn_install.log && false)
+install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true 
-Dsource.skip=true > mvn_install.log || (cat mvn_install.log && false)
 script: mvn test -P $HADOOP_PROFILE
diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml
index 979b4369d..5bab770b8 100644
--- a/parquet-protobuf/pom.xml
+++ b/parquet-protobuf/pom.xml
@@ -31,7 +31,7 @@
 
   <properties>
     <elephant-bird.version>4.4</elephant-bird.version>
-    <protobuf.version>3.2.0</protobuf.version>
+    <protobuf.version>3.5.1</protobuf.version>
     <!-- allow using protoc from an alternative path -->
     <protoc.path>protoc</protoc.path>
   </properties>
diff --git 
a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
 
b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
index ba46c84b3..6f5d50d48 100644
--- 
a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
+++ 
b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java
@@ -19,6 +19,8 @@
 package org.apache.parquet.hadoop.thrift;
 
 import java.io.ByteArrayInputStream;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.BytesWritable;
@@ -28,7 +30,8 @@
 import org.apache.thrift.protocol.TProtocol;
 import org.apache.thrift.protocol.TProtocolFactory;
 import org.apache.thrift.transport.TIOStreamTransport;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.apache.parquet.hadoop.BadConfigurationException;
 import org.apache.parquet.hadoop.api.WriteSupport;
 import org.apache.parquet.io.ColumnIOFactory;
@@ -45,6 +48,7 @@
 import org.apache.parquet.thrift.struct.ThriftType.StructType;
 
 public class ThriftBytesWriteSupport extends WriteSupport<BytesWritable> {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ThriftBytesWriteSupport.class);
   private static final String PARQUET_PROTOCOL_CLASS = 
"parquet.protocol.class";
 
   public static <U extends TProtocol> void setTProtocolClass(Configuration 
conf, Class<U> tProtocolClass) {
@@ -123,8 +127,32 @@ public WriteContext init(Configuration configuration) {
     return thriftWriteSupport.init(configuration);
   }
 
+  private static Method SET_READ_LENGTH;
+  static {
+    try {
+      SET_READ_LENGTH = TBinaryProtocol.class.getMethod("setReadLength", 
int.class);
+    } catch (NoSuchMethodException e) {
+      SET_READ_LENGTH = null;
+    }
+  }
+
   private TProtocol protocol(BytesWritable record) {
-    return protocolFactory.getProtocol(new TIOStreamTransport(new 
ByteArrayInputStream(record.getBytes())));
+    TProtocol protocol = protocolFactory.getProtocol(new 
TIOStreamTransport(new ByteArrayInputStream(record.getBytes())));
+
+    /* Reduce the chance of OOM when data is corrupted. When readBinary is 
called on TBinaryProtocol, it reads the length of the binary first,
+     so if the data is corrupted, it could read a big integer as the length of 
the binary and therefore causes OOM to happen.
+     Currently this fix only applies to TBinaryProtocol which has the 
setReadLength defined (thrift 0.7).
+      */
+    if (SET_READ_LENGTH != null && protocol instanceof TBinaryProtocol) {
+      try {
+        SET_READ_LENGTH.invoke(protocol, new Object[]{record.getLength()});
+      } catch (IllegalAccessException | IllegalArgumentException | 
InvocationTargetException e) {
+        LOG.warn("setReadLength should not throw an exception", e);
+        SET_READ_LENGTH = null;
+      }
+    }
+
+    return protocol;
   }
 
   @Override
diff --git a/pom.xml b/pom.xml
index 8a8e91ac8..c8c8ccf44 100644
--- a/pom.xml
+++ b/pom.xml
@@ -90,8 +90,8 @@
     <scala.maven.test.skip>false</scala.maven.test.skip>
     <pig.version>0.16.0</pig.version>
     <pig.classifier>h2</pig.classifier>
-    <thrift.version>0.7.0</thrift.version>
     <thrift-maven-plugin.version>0.10.0</thrift-maven-plugin.version>
+    <thrift.version>0.9.3</thrift.version>
     <fastutil.version>7.0.13</fastutil.version>
     <semver.api.version>0.9.33</semver.api.version>
     <slf4j.version>1.7.22</slf4j.version>


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> upgrade thrift and protobuf dependencies
> ----------------------------------------
>
>                 Key: PARQUET-1135
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1135
>             Project: Parquet
>          Issue Type: Improvement
>          Components: parquet-mr
>            Reporter: Julien Le Dem
>            Assignee: Julien Le Dem
>            Priority: Major
>
> thrift 0.7.0 -> 0.9.3
> protobuf 3.2 -> 3.4



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to