Repository: parquet-mr Updated Branches: refs/heads/master 2f956f465 -> 3f36b7b50
PARQUET-362 - Fix parquet buffered writer being oversensitive to union schema changes Parquet does prevent records with unknown union fields to be written as it would create a TProtocol violation. But it also prevents records with unions having one their field itself having an unknown field (which is acceptable if it is a struct). Author: Laurent Goujon <[email protected]> Closes #262 from laurentgo/fix-parquet-union-write-bug and squashes the following commits: d15ee74 [Laurent Goujon] Fix parquet buffered writer being oversentive to union changes Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/3f36b7b5 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/3f36b7b5 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/3f36b7b5 Branch: refs/heads/master Commit: 3f36b7b50bdda3eeca632ad5440bb82b8e34cb40 Parents: 2f956f4 Author: Laurent Goujon <[email protected]> Authored: Thu Aug 20 13:52:56 2015 -0700 Committer: Alex Levenson <[email protected]> Committed: Thu Aug 20 13:52:56 2015 -0700 ---------------------------------------------------------------------- .../thrift/BufferedProtocolReadToWrite.java | 9 +++-- .../parquet/thrift/TestProtocolReadToWrite.java | 37 ++++++++++++++++++++ parquet-thrift/src/test/thrift/compat.thrift | 8 +++++ 3 files changed, 49 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3f36b7b5/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java index 70bd003..9fce1c3 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java @@ -375,6 +375,9 @@ public class BufferedProtocolReadToWrite implements ProtocolPipe { hasFieldsIgnored |= true; continue; } + + childFieldsPresent++; + buffer.add(new Action() { @Override public void write(TProtocol out) throws TException { @@ -386,11 +389,7 @@ public class BufferedProtocolReadToWrite implements ProtocolPipe { return "f=" + currentField.id + "<t=" + typeName(currentField.type) + ">: "; } }); - boolean wasIgnored = readOneValue(in, field.type, buffer, expectedField.getType()); - if (!wasIgnored) { - childFieldsPresent++; - } - hasFieldsIgnored |= wasIgnored; + hasFieldsIgnored |= readOneValue(in, field.type, buffer, expectedField.getType()); in.readFieldEnd(); buffer.add(FIELD_END); } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3f36b7b5/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java index ba27166..bdd3a3f 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java @@ -220,6 +220,43 @@ public class TestProtocolReadToWrite { assertEquals(0, countingHandler.fieldIgnoredCount); } + @Test + public void testUnionWithStructWithUnknownField() throws Exception { + CountingErrorHandler countingHandler = new CountingErrorHandler(); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(UnionV3.class), countingHandler); + ByteArrayOutputStream in = new ByteArrayOutputStream(); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + + UnionV3 validUnion = UnionV3.aStruct(new StructV1("a valid struct")); + StructV2 structV2 = new StructV2("a valid struct"); + structV2.setAge("a valid age"); + UnionThatLooksLikeUnionV3 unionWithUnknownStructField = UnionThatLooksLikeUnionV3.aStruct(structV2); + + validUnion.write(protocol(in)); + unionWithUnknownStructField.write(protocol(in)); + + ByteArrayInputStream baos = new ByteArrayInputStream(in.toByteArray()); + + // both should not throw + p.readOne(protocol(baos), protocol(out)); + p.readOne(protocol(baos), protocol(out)); + + assertEquals(1, countingHandler.recordCountOfMissingFields); + assertEquals(1, countingHandler.fieldIgnoredCount); + + in = new ByteArrayOutputStream(); + validUnion.write(protocol(in)); + unionWithUnknownStructField.write(protocol(in)); + + baos = new ByteArrayInputStream(in.toByteArray()); + + // both should not throw + p.readOne(protocol(baos), protocol(out)); + p.readOne(protocol(baos), protocol(out)); + + assertEquals(2, countingHandler.recordCountOfMissingFields); + assertEquals(2, countingHandler.fieldIgnoredCount); + } /** * When enum value in data has an undefined index, it's considered as corrupted record and will be skipped. http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/3f36b7b5/parquet-thrift/src/test/thrift/compat.thrift ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/thrift/compat.thrift b/parquet-thrift/src/test/thrift/compat.thrift index 8c90e23..3316576 100644 --- a/parquet-thrift/src/test/thrift/compat.thrift +++ b/parquet-thrift/src/test/thrift/compat.thrift @@ -152,6 +152,10 @@ union UnionV2 { 3: ABool aNewBool } +union UnionV3 { + 1: StructV1 aStruct +} + struct StructWithUnionV1 { 1: required string name, 2: required UnionV1 aUnion @@ -173,6 +177,10 @@ struct StructWithAStructThatLooksLikeUnionV2 { 2: required AStructThatLooksLikeUnionV2 aNotQuiteUnion } +union UnionThatLooksLikeUnionV3 { + 1: StructV2 aStruct +} + union UnionOfStructs { 1: StructV3 structV3, 2: StructV4WithExtracStructField structV4,
