Repository: parquet-mr Updated Branches: refs/heads/master 454fc3655 -> b86f68e39
PARQUET-346: Minor fixes for PARQUET-350, PARQUET-348, PARQUET-346, PARQUET-345 PARQUET-346: ThriftSchemaConverter throws for unknown struct or union type This is triggered when passing a StructType that comes from old file metadata PARQUET-350: ThriftRecordConverter throws NPE for unrecognized enum values This is just some better error reporting. PARQUET-348: shouldIgnoreStatistics too noisy This is just a case of way over logging something, to the point that it make the logs unreadable PARQUET-345 ThriftMetaData toString() should not try to load class reflectively This is a case where the error reporting itself crashes, which results in the real error message getting lost Author: Alex Levenson <[email protected]> Closes #252 from isnotinvain/alexlevenson/various-fixes and squashes the following commits: 9b5cb0e [Alex Levenson] Add comments, cleanup some minor use of ThriftSchemaConverter 376343e [Alex Levenson] Fix test d9d5dad [Alex Levenson] add license headers e26dc0c [Alex Levenson] Add tests 8d9dde0 [Alex Levenson] Fixes for PARQUET-350, PARQUET-348, PARQUET-346, PARQUET-345 Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/b86f68e3 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/b86f68e3 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/b86f68e3 Branch: refs/heads/master Commit: b86f68e39dc7b6a7c2bff1e4fea3bb7c28d103f0 Parents: 454fc36 Author: Alex Levenson <[email protected]> Authored: Fri Jul 31 16:57:19 2015 -0700 Committer: Alex Levenson <[email protected]> Committed: Fri Jul 31 16:57:19 2015 -0700 ---------------------------------------------------------------------- .../org/apache/parquet/CorruptStatistics.java | 30 ++++-- .../scrooge/ScroogeStructConverterTest.java | 5 +- .../thrift/AbstractThriftWriteSupport.java | 3 +- .../hadoop/thrift/TBaseWriteSupport.java | 3 +- .../hadoop/thrift/ThriftBytesWriteSupport.java | 5 +- .../apache/parquet/thrift/ThriftMetaData.java | 3 +- .../parquet/thrift/ThriftRecordConverter.java | 49 +++++---- .../thrift/ThriftSchemaConvertVisitor.java | 23 +++-- .../parquet/thrift/ThriftSchemaConverter.java | 16 ++- .../thrift/struct/CompatibilityRunner.java | 2 +- .../parquet/thrift/TestProtocolReadToWrite.java | 14 +-- .../parquet/thrift/TestThriftMetaData.java | 55 ++++++++++ .../thrift/TestThriftRecordConverter.java | 101 +++++++++++++++++++ .../thrift/TestThriftToPigCompatibility.java | 2 +- .../thrift/struct/CompatibilityCheckerTest.java | 2 +- .../StructWithUnionV1NoStructOrUnionMeta.json | 49 +++++++++ 16 files changed, 303 insertions(+), 59 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java index e2b8114..3869cda 100644 --- a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java @@ -18,6 +18,8 @@ */ package org.apache.parquet; +import java.util.concurrent.atomic.AtomicBoolean; + import org.apache.parquet.SemanticVersion.SemanticVersionParseException; import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.VersionParser.VersionParseException; @@ -31,6 +33,8 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; * and thus it's statistics should be ignored / not trusted. */ public class CorruptStatistics { + private static final AtomicBoolean alreadyLogged = new AtomicBoolean(false); + private static final Log LOG = Log.getLog(CorruptStatistics.class); // the version in which the bug described by jira: PARQUET-251 was fixed @@ -52,7 +56,7 @@ public class CorruptStatistics { if (Strings.isNullOrEmpty(createdBy)) { // created_by is not populated, which could have been caused by // parquet-mr during the same time as PARQUET-251, see PARQUET-297 - LOG.info("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297"); + warnOnce("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297"); return true; } @@ -65,16 +69,16 @@ public class CorruptStatistics { } if (Strings.isNullOrEmpty(version.version)) { - LOG.warn("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy); + warnOnce("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy); return true; } SemanticVersion semver = SemanticVersion.parse(version.version); if (semver.compareTo(PARQUET_251_FIXED_VERSION) < 0) { - LOG.info("Ignoring statistics because this file was created prior to " + warnOnce("Ignoring statistics because this file was created prior to " + PARQUET_251_FIXED_VERSION - + ", see PARQUET-251" ); + + ", see PARQUET-251"); return true; } @@ -83,22 +87,30 @@ public class CorruptStatistics { } catch (RuntimeException e) { // couldn't parse the created_by field, log what went wrong, don't trust the stats, // but don't make this fatal. - warnParseError(createdBy, e); + warnParseErrorOnce(createdBy, e); return true; } catch (SemanticVersionParseException e) { // couldn't parse the created_by field, log what went wrong, don't trust the stats, // but don't make this fatal. - warnParseError(createdBy, e); + warnParseErrorOnce(createdBy, e); return true; } catch (VersionParseException e) { // couldn't parse the created_by field, log what went wrong, don't trust the stats, // but don't make this fatal. - warnParseError(createdBy, e); + warnParseErrorOnce(createdBy, e); return true; } } - private static void warnParseError(String createdBy, Throwable e) { - LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e); + private static void warnParseErrorOnce(String createdBy, Throwable e) { + if(!alreadyLogged.getAndSet(true)) { + LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e); + } + } + + private static void warnOnce(String message) { + if(!alreadyLogged.getAndSet(true)) { + LOG.warn(message); + } } } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java ---------------------------------------------------------------------- diff --git a/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java b/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java index 648634c..8acbf96 100644 --- a/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java +++ b/parquet-scrooge/src/test/java/org/apache/parquet/scrooge/ScroogeStructConverterTest.java @@ -61,15 +61,14 @@ public class ScroogeStructConverterTest { */ private void shouldConvertConsistentlyWithThriftStructConverter(Class scroogeClass) throws ClassNotFoundException { Class<? extends TBase<?, ?>> thriftClass = (Class<? extends TBase<?, ?>>)Class.forName(scroogeClass.getName().replaceFirst("org.apache.parquet.scrooge.test", "org.apache.parquet.thrift.test")); - ThriftType.StructType structFromThriftSchemaConverter = new ThriftSchemaConverter().toStructType(thriftClass); + ThriftType.StructType structFromThriftSchemaConverter = ThriftSchemaConverter.toStructType(thriftClass); ThriftType.StructType structFromScroogeSchemaConverter = new ScroogeStructConverter().convert(scroogeClass); assertEquals(toParquetSchema(structFromThriftSchemaConverter), toParquetSchema(structFromScroogeSchemaConverter)); } private MessageType toParquetSchema(ThriftType.StructType struct) { - ThriftSchemaConverter sc = new ThriftSchemaConverter(); - return sc.convert(struct); + return ThriftSchemaConverter.convertWithoutProjection(struct); } @Test http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java index 91350cc..5f210d3 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/AbstractThriftWriteSupport.java @@ -84,8 +84,7 @@ public abstract class AbstractThriftWriteSupport<T> extends WriteSupport<T> { this.thriftClass = thriftClass; this.thriftStruct = getThriftStruct(); - ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter(); - this.schema = thriftSchemaConverter.convert(thriftStruct); + this.schema = ThriftSchemaConverter.convertWithoutProjection(thriftStruct); final Map<String, String> extraMetaData = new ThriftMetaData(thriftClass.getName(), thriftStruct).toExtraMetaData(); // adding the Pig schema as it would have been mapped from thrift http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java index 2b3b310..b457278 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java @@ -47,8 +47,7 @@ public class TBaseWriteSupport<T extends TBase<?, ?>> extends AbstractThriftWrit @Override protected StructType getThriftStruct() { - ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter(); - return thriftSchemaConverter.toStructType((Class<TBase<?, ?>>)thriftClass); + return ThriftSchemaConverter.toStructType(thriftClass); } @Override http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java index 6caecbc..6db769e 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java @@ -108,9 +108,8 @@ public class ThriftBytesWriteSupport extends WriteSupport<BytesWritable> { } else { thriftClass = TBaseWriteSupport.getThriftClass(configuration); } - ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter(); - this.thriftStruct = thriftSchemaConverter.toStructType(thriftClass); - this.schema = thriftSchemaConverter.convert(thriftStruct); + this.thriftStruct = ThriftSchemaConverter.toStructType(thriftClass); + this.schema = ThriftSchemaConverter.convertWithoutProjection(thriftStruct); if (buffered) { readToWrite = new BufferedProtocolReadToWrite(thriftStruct, errorHandler); } else { http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java index f0a9624..a89f8d9 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftMetaData.java @@ -128,7 +128,6 @@ public class ThriftMetaData { @Override public String toString() { - return "ThriftMetaData" + toExtraMetaData(); + return String.format("ThriftMetaData(thriftClassName: %s, descriptor: %s)", thriftClassName, descriptor); } - } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java index ec0f4ff..e18b0e6 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftRecordConverter.java @@ -24,6 +24,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.parquet.io.ParquetDecodingException; import org.apache.thrift.TException; import org.apache.thrift.protocol.TField; import org.apache.thrift.protocol.TList; @@ -62,7 +63,7 @@ import org.apache.parquet.thrift.struct.ThriftTypeID; */ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { - final ParquetProtocol readFieldEnd = new ParquetProtocol("readFieldEnd()") { + final static ParquetProtocol readFieldEnd = new ParquetProtocol("readFieldEnd()") { @Override public void readFieldEnd() throws TException { } @@ -75,7 +76,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class PrimitiveFieldHandler extends PrimitiveConverter { + static class PrimitiveFieldHandler extends PrimitiveConverter { private final PrimitiveConverter delegate; private final List<TProtocol> events; @@ -154,7 +155,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class GroupFieldhandler extends GroupConverter { + static class GroupFieldhandler extends GroupConverter { private final GroupConverter delegate; private final List<TProtocol> events; @@ -203,7 +204,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class GroupCounter extends GroupConverter implements Counter { + static class GroupCounter extends GroupConverter implements Counter { private final GroupConverter delegate; private int count; @@ -246,7 +247,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class PrimitiveCounter extends PrimitiveConverter implements Counter { + static class PrimitiveCounter extends PrimitiveConverter implements Counter { private final PrimitiveConverter delegate; private int count; @@ -309,7 +310,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class FieldPrimitiveConverter extends PrimitiveConverter { + static class FieldPrimitiveConverter extends PrimitiveConverter { private final List<TProtocol> events; private ThriftTypeID type; @@ -400,7 +401,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class FieldStringConverter extends PrimitiveConverter { + static class FieldStringConverter extends PrimitiveConverter { private final List<TProtocol> events; @@ -429,14 +430,15 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class FieldEnumConverter extends PrimitiveConverter { + static class FieldEnumConverter extends PrimitiveConverter { private final List<TProtocol> events; - - private Map<Binary, Integer> enumLookup = new HashMap<Binary, Integer>(); + private final Map<Binary, Integer> enumLookup = new HashMap<Binary, Integer>(); + private final ThriftField field; public FieldEnumConverter(List<TProtocol> events, ThriftField field) { this.events = events; + this.field = field; final Iterable<EnumValue> values = ((EnumType)field.getType()).getValues(); for (EnumValue enumValue : values) { enumLookup.put(Binary.fromString(enumValue.getName()), enumValue.getId()); @@ -445,7 +447,16 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { @Override public void addBinary(final Binary value) { - final int id = enumLookup.get(value); + final Integer id = enumLookup.get(value); + + if (id == null) { + throw new ParquetDecodingException("Unrecognized enum value: " + + value.toStringUsingUTF8() + + " known values: " + + enumLookup + + " in " + this.field); + } + events.add(new ParquetProtocol("readI32() enum") { @Override public int readI32() throws TException { @@ -461,7 +472,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class MapConverter extends GroupConverter { + static class MapConverter extends GroupConverter { private final GroupCounter child; private final List<TProtocol> mapEvents = new ArrayList<TProtocol>(); @@ -523,7 +534,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class MapKeyValueConverter extends GroupConverter { + static class MapKeyValueConverter extends GroupConverter { private Converter keyConverter; private Converter valueConverter; @@ -561,7 +572,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class SetConverter extends CollectionConverter { + static class SetConverter extends CollectionConverter { final ParquetProtocol readSetEnd = new ParquetProtocol("readSetEnd()") { @Override @@ -598,7 +609,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class ListConverter extends CollectionConverter { + static class ListConverter extends CollectionConverter { final ParquetProtocol readListEnd = new ParquetProtocol("readListEnd()") { @Override @@ -635,7 +646,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - abstract class CollectionConverter extends GroupConverter { + static abstract class CollectionConverter extends GroupConverter { private final Converter child; private final Counter childCounter; @@ -696,7 +707,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { * @author Julien Le Dem * */ - class StructConverter extends GroupConverter { + static class StructConverter extends GroupConverter { private final int schemaSize; @@ -794,7 +805,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { this.thriftReader = thriftReader; this.protocol = new ParquetReadProtocol(); this.thriftType = thriftType; - MessageType fullSchema = new ThriftSchemaConverter().convert(thriftType); + MessageType fullSchema = ThriftSchemaConverter.convertWithoutProjection(thriftType); missingRequiredFieldsInProjection = hasMissingRequiredFieldInGroupType(requestedParquetSchema, fullSchema); this.structConverter = new StructConverter(rootEvents, requestedParquetSchema, new ThriftField(name, (short)0, Requirement.REQUIRED, thriftType)); } @@ -863,7 +874,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> { return structConverter; } - private Converter newConverter(List<TProtocol> events, Type type, ThriftField field) { + private static Converter newConverter(List<TProtocol> events, Type type, ThriftField field) { switch (field.getType().getType()) { case LIST: return new ListConverter(events, type.asGroupType(), field); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java index 2c05c30..88effc5 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java @@ -76,16 +76,23 @@ import static org.apache.parquet.schema.Types.primitive; class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedField, ThriftSchemaConvertVisitor.State> { private final FieldProjectionFilter fieldProjectionFilter; private final boolean doProjection; + private final boolean keepOneOfEachUnion; - private ThriftSchemaConvertVisitor(FieldProjectionFilter fieldProjectionFilter, boolean doProjection) { + private ThriftSchemaConvertVisitor(FieldProjectionFilter fieldProjectionFilter, boolean doProjection, boolean keepOneOfEachUnion) { this.fieldProjectionFilter = checkNotNull(fieldProjectionFilter, "fieldProjectionFilter"); this.doProjection = doProjection; + this.keepOneOfEachUnion = keepOneOfEachUnion; } + @Deprecated public static MessageType convert(StructType struct, FieldProjectionFilter filter) { + return convert(struct, filter, true); + } + + public static MessageType convert(StructType struct, FieldProjectionFilter filter, boolean keepOneOfEachUnion) { State state = new State(new FieldsPath(), REPEATED, "ParquetSchema"); - ConvertedField converted = struct.accept(new ThriftSchemaConvertVisitor(filter, true), state); + ConvertedField converted = struct.accept(new ThriftSchemaConvertVisitor(filter, true, keepOneOfEachUnion), state); if (!converted.isKeep()) { throw new ThriftProjectionException("No columns have been selected"); @@ -134,7 +141,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie if (doProjection) { ConvertedField fullConvKey = keyField .getType() - .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false), keyState); + .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false, keepOneOfEachUnion), keyState); if (!fullConvKey.asKeep().getType().equals(convertedKey.asKeep().getType())) { throw new ThriftProjectionException("Cannot select only a subset of the fields in a map key, " + @@ -160,7 +167,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie // keep only the key, not the value ConvertedField sentinelValue = - valueField.getType().accept(new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true), valueState); + valueField.getType().accept(new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true, keepOneOfEachUnion), valueState); Type mapField = mapType( state.repetition, @@ -181,7 +188,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie if (isSet && doProjection) { ConvertedField fullConv = listLike .getType() - .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false), childState); + .accept(new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false, keepOneOfEachUnion), childState); if (!converted.asKeep().getType().equals(fullConv.asKeep().getType())) { throw new ThriftProjectionException("Cannot select only a subset of the fields in a set, " + "for path " + state.path); @@ -210,7 +217,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie // special care is taken when converting unions, // because we are actually both converting + projecting in // one pass, and unions need special handling when projecting. - final boolean isUnion = isUnion(structType.getStructOrUnionType()); + final boolean needsToKeepOneOfEachUnion = keepOneOfEachUnion && isUnion(structType.getStructOrUnionType()); boolean hasSentinelUnionColumns = false; boolean hasNonSentinelUnionColumns = false; @@ -223,7 +230,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie ConvertedField converted = child.getType().accept(this, childState); - if (isUnion && !converted.isKeep()) { + if (!converted.isKeep() && needsToKeepOneOfEachUnion) { // user is not keeping this "kind" of union, but we still need // to keep at least one of the primitives of this union around. // in order to know what "kind" of union each record is. @@ -232,7 +239,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie // re-do the recursion, with a new projection filter that keeps only // the first primitive it encounters ConvertedField firstPrimitive = child.getType().accept( - new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true), childState); + new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true, keepOneOfEachUnion), childState); convertedChildren.add(firstPrimitive.asKeep().getType().withId(child.getFieldId())); hasSentinelUnionColumns = true; http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java index 95d998b..98820c3 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java @@ -56,12 +56,26 @@ public class ThriftSchemaConverter { return convert(toStructType(thriftClass)); } + /** + * struct is assumed to contain valid structOrUnionType metadata when used with this method. + * This method may throw if structOrUnionType is unknown. + * + * Use convertWithoutProjection below to convert a StructType to MessageType + */ public MessageType convert(StructType struct) { - MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, fieldProjectionFilter); + MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, fieldProjectionFilter, true); fieldProjectionFilter.assertNoUnmatchedPatterns(); return messageType; } + /** + * struct is not required to have known structOrUnionType, which is useful + * for converting a StructType from an (older) file schema to a MessageType + */ + public static MessageType convertWithoutProjection(StructType struct) { + return ThriftSchemaConvertVisitor.convert(struct, FieldProjectionFilter.ALL_COLUMNS, false); + } + public static <T extends TBase<?,?>> StructOrUnionType structOrUnionType(Class<T> klass) { return TUnion.class.isAssignableFrom(klass) ? StructOrUnionType.UNION : StructOrUnionType.STRUCT; } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java index 0d05e76..b8d577d 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/CompatibilityRunner.java @@ -95,7 +95,7 @@ public class CompatibilityRunner { String className = arguments.pollFirst(); String storedPath = arguments.pollFirst(); File storeDir = new File(storedPath); - ThriftType.StructType structType = new ThriftSchemaConverter().toStructType((Class<? extends TBase<?, ?>>) Class.forName(className)); + ThriftType.StructType structType = ThriftSchemaConverter.toStructType((Class<? extends TBase<?, ?>>) Class.forName(className)); ObjectMapper mapper = new ObjectMapper(); String fileName = catName + ".json"; http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java index e7be3ea..ba27166 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestProtocolReadToWrite.java @@ -92,7 +92,7 @@ public class TestProtocolReadToWrite { private void writeReadCompare(TBase<?, ?> a) throws TException, InstantiationException, IllegalAccessException { - ProtocolPipe[] pipes = {new ProtocolReadToWrite(), new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType((Class<TBase<?, ?>>)a.getClass()))}; + ProtocolPipe[] pipes = {new ProtocolReadToWrite(), new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType((Class<TBase<?, ?>>)a.getClass()))}; for (ProtocolPipe p : pipes) { final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); @@ -110,7 +110,7 @@ public class TestProtocolReadToWrite { //handler will rethrow the exception for verifying purpose CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(AddressBook.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(AddressBook.class), countingHandler); final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); @@ -134,7 +134,7 @@ public class TestProtocolReadToWrite { @Test public void testUnrecognizedUnionMemberSchema() throws Exception { CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithUnionV1.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithUnionV1.class), countingHandler); final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); StructWithUnionV1 validUnion = new StructWithUnionV1("a valid struct", UnionV1.aLong(new ALong(17L))); @@ -164,7 +164,7 @@ public class TestProtocolReadToWrite { @Test public void testUnionWithExtraOrNoValues() throws Exception { CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithUnionV2.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithUnionV2.class), countingHandler); ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); @@ -229,7 +229,7 @@ public class TestProtocolReadToWrite { @Test public void testEnumMissingSchema() throws Exception { CountingErrorHandler countingHandler = new CountingErrorHandler(); - BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithEnum.class), countingHandler); + BufferedProtocolReadToWrite p = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithEnum.class), countingHandler); final ByteArrayOutputStream in = new ByteArrayOutputStream(); final ByteArrayOutputStream out = new ByteArrayOutputStream(); StructWithMoreEnum enumDefinedInOldDefinition = new StructWithMoreEnum(NumberEnumWithMoreValue.THREE); @@ -268,7 +268,7 @@ public class TestProtocolReadToWrite { fieldIgnoredCount++; } }; - BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructV3.class), countingHandler); + BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructV3.class), countingHandler); //Data has an extra field of type struct final ByteArrayOutputStream in = new ByteArrayOutputStream(); @@ -306,7 +306,7 @@ public class TestProtocolReadToWrite { } }; - BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(new ThriftSchemaConverter().toStructType(StructWithIndexStartsFrom4.class), countingHandler); + BufferedProtocolReadToWrite structForRead = new BufferedProtocolReadToWrite(ThriftSchemaConverter.toStructType(StructWithIndexStartsFrom4.class), countingHandler); //Data has an extra field of type struct final ByteArrayOutputStream in = new ByteArrayOutputStream(); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java new file mode 100644 index 0000000..e7f42ce --- /dev/null +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftMetaData.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.thrift; + +import java.util.ArrayList; + +import org.apache.parquet.thrift.struct.ThriftField; +import org.apache.parquet.thrift.struct.ThriftType.StructType; +import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class TestThriftMetaData { + + /** + * Previously, ThriftMetaData.toString would try to instantiate thriftClassName, + * but there is no guarantee that that class is on the classpath, and it is in fact + * normal for that to be the case (for example, when a file was written with TBase objects + * but is being read with scrooge objects). + * + * See PARQUET-345 + */ + @Test + public void testToStringDoesNotThrow() { + + StructType descriptor = new StructType(new ArrayList<ThriftField>(), StructOrUnionType.STRUCT); + ThriftMetaData tmd = new ThriftMetaData("non existent class!!!", descriptor); + assertEquals("ThriftMetaData(thriftClassName: non existent class!!!, descriptor: {\n" + + " \"id\" : \"STRUCT\",\n" + + " \"children\" : [ ],\n" + + " \"structOrUnionType\" : \"STRUCT\"\n" + + "})", tmd.toString()); + + tmd = new ThriftMetaData("non existent class!!!", null); + assertEquals("ThriftMetaData(thriftClassName: non existent class!!!, descriptor: null)", tmd.toString()); + + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java new file mode 100644 index 0000000..1619dd5 --- /dev/null +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftRecordConverter.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.thrift; + +import java.io.File; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; + +import org.apache.parquet.Files; +import org.apache.parquet.Strings; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.thrift.ThriftRecordConverter.FieldEnumConverter; +import org.apache.parquet.thrift.struct.ThriftField; +import org.apache.parquet.thrift.struct.ThriftField.Requirement; +import org.apache.parquet.thrift.struct.ThriftType; +import org.apache.parquet.thrift.struct.ThriftType.EnumType; +import org.apache.parquet.thrift.struct.ThriftType.EnumValue; +import org.apache.parquet.thrift.struct.ThriftType.StructType; +import org.apache.parquet.thrift.test.compat.StructWithUnionV1; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TProtocol; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class TestThriftRecordConverter { + @Test + public void testUnknownEnumThrowsGoodException() throws Exception { + EnumType et = new EnumType(Arrays.asList(new EnumValue(77, "hello"))); + ThriftField field = new ThriftField("name", (short) 1, Requirement.REQUIRED, et); + + ArrayList<TProtocol> events = new ArrayList<TProtocol>(); + + FieldEnumConverter conv = new FieldEnumConverter(events, field); + + conv.addBinary(Binary.fromString("hello")); + + assertEquals(1, events.size()); + assertEquals(77, events.get(0).readI32()); + + try { + conv.addBinary(Binary.fromString("FAKE_ENUM_VALUE")); + fail("this should throw"); + } catch (ParquetDecodingException e) { + assertEquals("Unrecognized enum value: FAKE_ENUM_VALUE known values: {Binary{\"hello\"}=77} in {\n" + + " \"name\" : \"name\",\n" + + " \"fieldId\" : 1,\n" + + " \"requirement\" : \"REQUIRED\",\n" + + " \"type\" : {\n" + + " \"id\" : \"ENUM\",\n" + + " \"values\" : [ {\n" + + " \"id\" : 77,\n" + + " \"name\" : \"hello\"\n" + + " } ]\n" + + " }\n" + + "}", e.getMessage()); + } + } + + @Test + public void constructorDoesNotRequireStructOrUnionTypeMeta() throws Exception { + String jsonWithNoStructOrUnionMeta = Strings.join( + Files.readAllLines( + new File("src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json"), + Charset.forName("UTF-8")), "\n"); + + StructType noStructOrUnionMeta = (StructType) ThriftType.fromJSON(jsonWithNoStructOrUnionMeta); + + // this used to throw, see PARQUET-346 + new ThriftRecordConverter<StructWithUnionV1>( + new ThriftReader<StructWithUnionV1>() { + @Override + public StructWithUnionV1 readOneRecord(TProtocol protocol) throws TException { + return null; + } + }, + "name", + new ThriftSchemaConverter().convert(StructWithUnionV1.class), + noStructOrUnionMeta + ); + } +} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java index f45f175..c320f71 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/TestThriftToPigCompatibility.java @@ -154,7 +154,7 @@ public class TestThriftToPigCompatibility { final Class<T> class1 = (Class<T>) o.getClass(); final MessageType schema = thriftSchemaConverter.convert(class1); - final StructType structType = thriftSchemaConverter.toStructType(class1); + final StructType structType = ThriftSchemaConverter.toStructType(class1); final ThriftToPig<T> thriftToPig = new ThriftToPig<T>(class1); final Schema pigSchema = thriftToPig.toSchema(); final TupleRecordMaterializer tupleRecordConverter = new TupleRecordMaterializer(schema, pigSchema, true); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java b/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java index f07aa9d..df034ba 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/thrift/struct/CompatibilityCheckerTest.java @@ -116,7 +116,7 @@ public class CompatibilityCheckerTest { } private ThriftType.StructType struct(Class thriftClass) { - return new ThriftSchemaConverter().toStructType(thriftClass); + return ThriftSchemaConverter.toStructType(thriftClass); } private CompatibilityReport getCompatibilityReport(Class oldClass, Class newClass) { http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b86f68e3/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json b/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json new file mode 100644 index 0000000..ac42b76 --- /dev/null +++ b/parquet-thrift/src/test/resources/org/apache/parquet/thrift/StructWithUnionV1NoStructOrUnionMeta.json @@ -0,0 +1,49 @@ +{ + "id" : "STRUCT", + "children" : [ { + "name" : "name", + "fieldId" : 1, + "requirement" : "REQUIRED", + "type" : { + "id" : "STRING" + } + }, { + "name" : "aUnion", + "fieldId" : 2, + "requirement" : "REQUIRED", + "type" : { + "id" : "STRUCT", + "children" : [ { + "name" : "aString", + "fieldId" : 1, + "requirement" : "DEFAULT", + "type" : { + "id" : "STRUCT", + "children" : [ { + "name" : "s", + "fieldId" : 1, + "requirement" : "REQUIRED", + "type" : { + "id" : "STRING" + } + } ] + } + }, { + "name" : "aLong", + "fieldId" : 2, + "requirement" : "DEFAULT", + "type" : { + "id" : "STRUCT", + "children" : [ { + "name" : "l", + "fieldId" : 1, + "requirement" : "REQUIRED", + "type" : { + "id" : "I64" + } + } ] + } + } ] + } + } ] +} \ No newline at end of file
