Repository: incubator-gobblin Updated Branches: refs/heads/master a02073e9d -> ab7dfe622
[GOBBLIN-98] Orc records get dropped and duplicated Closes #2283 from PraTrick/GOBBLIN-98 Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/ab7dfe62 Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/ab7dfe62 Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/ab7dfe62 Branch: refs/heads/master Commit: ab7dfe62268c8fbe63a6959fb71cb205f6641f50 Parents: a02073e Author: Prateek Gupta <[email protected]> Authored: Thu Jun 14 16:06:09 2018 -0700 Committer: Abhishek Tiwari <[email protected]> Committed: Thu Jun 14 16:06:09 2018 -0700 ---------------------------------------------------------------------- .../converter/serde/OrcSerDeWrapper.java | 48 ++++++++++++++++++++ .../src/test/resources/serde/serde.properties | 4 +- 2 files changed, 51 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ab7dfe62/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java ---------------------------------------------------------------------- diff --git a/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java b/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java new file mode 100644 index 0000000..bd8a735 --- /dev/null +++ b/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package gobblin.converter.serde; + +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.io.Writable; + +import java.util.ArrayList; + +/** + * Hive's {@link OrcSerde} caches converted records - the {@link OrcSerde} has a single + * {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow} and every time the + * {@link org.apache.hadoop.hive.serde2.Serializer#serialize(Object, ObjectInspector)} method is called, the object is + * re-used. + * + * The problem is that {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow} is package-private and has no + * public constructor, so no copy can be made. This would be fine if {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow} + * were immediately written out. But all Gobblin jobs have a buffer that the writer reads from. This buffering can cause + * race conditions where records get dropped or duplicated. 
+ * + * @author Prateek Gupta + */ + +public class OrcSerDeWrapper extends OrcSerde { + + @Override + public Writable serialize(Object realRow, ObjectInspector inspector) { + Object realRowClone = ObjectInspectorUtils.copyToStandardObject(realRow, inspector); + return super.serialize(realRowClone, inspector); + } +} http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ab7dfe62/gobblin-core/src/test/resources/serde/serde.properties ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/serde/serde.properties b/gobblin-core/src/test/resources/serde/serde.properties index 496ddc6..df42d53 100644 --- a/gobblin-core/src/test/resources/serde/serde.properties +++ b/gobblin-core/src/test/resources/serde/serde.properties @@ -17,7 +17,9 @@ avro.schema.url=gobblin-core/src/test/resources/serde/serde.avsc source.hadoop.file.input.paths=gobblin-core/src/test/resources/serde/serde.avro -serde.serializer.type=ORC +serde.serializer.type=gobblin.converter.serde.OrcSerDeWrapper +serde.serializer.input.format.type=org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +serde.serializer.output.format.type=org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat serde.deserializer.type=AVRO writer.staging.dir=gobblin-core/src/test/resources/serde/output-staging writer.output.dir=gobblin-core/src/test/resources/serde/output
