[ https://issues.apache.org/jira/browse/PARQUET-1135?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16393851#comment-16393851 ]
ASF GitHub Bot commented on PARQUET-1135: ----------------------------------------- julienledem closed pull request #427: PARQUET-1135: upgrade thrift and protobuf dependencies URL: https://github.com/apache/parquet-mr/pull/427 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/.travis.yml b/.travis.yml index 55f6e9afa..ef02f9b9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,34 +1,37 @@ language: java before_install: + - date - sudo apt-get update -qq - sudo apt-get install build-essential + - date - mkdir protobuf_install - pushd protobuf_install - - wget https://github.com/google/protobuf/archive/v3.2.0.tar.gz -O protobuf-3.2.0.tar.gz - - tar xzf protobuf-3.2.0.tar.gz - - cd protobuf-3.2.0 + - wget https://github.com/google/protobuf/archive/v3.5.1.tar.gz -O protobuf-3.5.1.tar.gz + - tar xzf protobuf-3.5.1.tar.gz + - cd protobuf-3.5.1 - sudo apt-get install autoconf automake libtool curl make g++ unzip - ./autogen.sh - ./configure - make - - make check - sudo make install - sudo ldconfig - protoc --version - popd + - date - pwd - sudo apt-get install -qq libboost-dev libboost-test-dev libboost-program-options-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev - - wget -nv http://archive.apache.org/dist/thrift/0.7.0/thrift-0.7.0.tar.gz - - tar zxf thrift-0.7.0.tar.gz - - cd thrift-0.7.0 + - wget -nv http://archive.apache.org/dist/thrift/0.9.3/thrift-0.9.3.tar.gz + - tar zxf thrift-0.9.3.tar.gz + - cd thrift-0.9.3 - chmod +x ./configure - - ./configure --disable-gen-erl --disable-gen-hs --without-ruby --without-haskell --without-erlang --without-php + - ./configure --disable-gen-erl --disable-gen-hs --without-ruby --without-haskell --without-erlang --without-php --without-nodejs - sudo make install - cd .. + - date env: - HADOOP_PROFILE=default TEST_CODECS=uncompressed,brotli - HADOOP_PROFILE=default TEST_CODECS=gzip,snappy -install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || (cat mvn_install.log && false) +install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || (cat mvn_install.log && false) script: mvn test -P $HADOOP_PROFILE diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml index 979b4369d..5bab770b8 100644 --- a/parquet-protobuf/pom.xml +++ b/parquet-protobuf/pom.xml @@ -31,7 +31,7 @@ <properties> <elephant-bird.version>4.4</elephant-bird.version> - <protobuf.version>3.2.0</protobuf.version> + <protobuf.version>3.5.1</protobuf.version> <!-- allow using protoc from an alternative path --> <protoc.path>protoc</protoc.path> </properties> diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java index ba46c84b3..6f5d50d48 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java @@ -19,6 +19,8 @@ package org.apache.parquet.hadoop.thrift; import java.io.ByteArrayInputStream; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.BytesWritable; @@ -28,7 +30,8 @@ import org.apache.thrift.protocol.TProtocol; import org.apache.thrift.protocol.TProtocolFactory; import org.apache.thrift.transport.TIOStreamTransport; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.parquet.hadoop.BadConfigurationException; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.io.ColumnIOFactory; @@ -45,6 +48,7 @@ import org.apache.parquet.thrift.struct.ThriftType.StructType; public class ThriftBytesWriteSupport extends WriteSupport<BytesWritable> { + private static final Logger LOG = LoggerFactory.getLogger(ThriftBytesWriteSupport.class); private static final String PARQUET_PROTOCOL_CLASS = "parquet.protocol.class"; public static <U extends TProtocol> void setTProtocolClass(Configuration conf, Class<U> tProtocolClass) { @@ -123,8 +127,32 @@ public WriteContext init(Configuration configuration) { return thriftWriteSupport.init(configuration); } + private static Method SET_READ_LENGTH; + static { + try { + SET_READ_LENGTH = TBinaryProtocol.class.getMethod("setReadLength", int.class); + } catch (NoSuchMethodException e) { + SET_READ_LENGTH = null; + } + } + private TProtocol protocol(BytesWritable record) { - return protocolFactory.getProtocol(new TIOStreamTransport(new ByteArrayInputStream(record.getBytes()))); + TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(new ByteArrayInputStream(record.getBytes()))); + + /* Reduce the chance of OOM when data is corrupted. When readBinary is called on TBinaryProtocol, it reads the length of the binary first, + so if the data is corrupted, it could read a big integer as the length of the binary and therefore causes OOM to happen. + Currently this fix only applies to TBinaryProtocol which has the setReadLength defined (thrift 0.7). + */ + if (SET_READ_LENGTH != null && protocol instanceof TBinaryProtocol) { + try { + SET_READ_LENGTH.invoke(protocol, new Object[]{record.getLength()}); + } catch (IllegalAccessException | IllegalArgumentException | InvocationTargetException e) { + LOG.warn("setReadLength should not throw an exception", e); + SET_READ_LENGTH = null; + } + } + + return protocol; } @Override diff --git a/pom.xml b/pom.xml index 8a8e91ac8..c8c8ccf44 100644 --- a/pom.xml +++ b/pom.xml @@ -90,8 +90,8 @@ <scala.maven.test.skip>false</scala.maven.test.skip> <pig.version>0.16.0</pig.version> <pig.classifier>h2</pig.classifier> - <thrift.version>0.7.0</thrift.version> <thrift-maven-plugin.version>0.10.0</thrift-maven-plugin.version> + <thrift.version>0.9.3</thrift.version> <fastutil.version>7.0.13</fastutil.version> <semver.api.version>0.9.33</semver.api.version> <slf4j.version>1.7.22</slf4j.version> ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > upgrade thrift and protobuf dependencies > ---------------------------------------- > > Key: PARQUET-1135 > URL: https://issues.apache.org/jira/browse/PARQUET-1135 > Project: Parquet > Issue Type: Improvement > Components: parquet-mr > Reporter: Julien Le Dem > Assignee: Julien Le Dem > Priority: Major > > thrift 0.7.0 -> 0.9.3 > protobuf 3.2 -> 3.4 -- This message was sent by Atlassian JIRA (v7.6.3#76005)