fishy commented on a change in pull request #2439: URL: https://github.com/apache/thrift/pull/2439#discussion_r706631147
########## File path: lib/java/src/org/apache/thrift/partial/PartialThriftDeserializer.java ########## @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.thrift.partial; + +import org.apache.thrift.partial.Validate; + +import org.apache.thrift.TBase; +import org.apache.thrift.TEnum; +import org.apache.thrift.TException; +import org.apache.thrift.TFieldIdEnum; +import org.apache.thrift.meta_data.EnumMetaData; +import org.apache.thrift.meta_data.StructMetaData; +import org.apache.thrift.protocol.TList; +import org.apache.thrift.protocol.TMap; +import org.apache.thrift.protocol.TSet; +import org.apache.thrift.protocol.TType; + +/** + * Supports partial deserialization of serialized Thrift data. + * + * The end result of deserialization can be a Thrift object (? extends TBase) + * or it could be a different construct. It is achieved by using a + * ThriftFieldValueProcessor instance which processes each field value + * as it gets deserialized. + */ +public class PartialThriftDeserializer<T extends TBase> { + + // Metadata that describes fields to deserialize. + private final ThriftMetadata.ThriftStruct metadata; + + // Processor that handles deserialized field values. + private final ThriftFieldValueProcessor processor; + + // Partial thrift protocol to use for deserialization. + private final PartialThriftProtocol protocol; + + /** + * Constructs an instance of PartialThriftDeserializer. + * + * @param metadata the Metadata that describes fields to deserialize. + * @param processor the Processor that handles deserialized field values. + * @param protocol the Partial thrift protocol to use for deserialization. + */ + public PartialThriftDeserializer( + ThriftMetadata.ThriftStruct metadata, + ThriftFieldValueProcessor processor, + PartialThriftProtocol protocol) { + + Validate.checkNotNull(metadata, "metadata"); + Validate.checkNotNull(processor, "processor"); + Validate.checkNotNull(protocol, "protocol"); + + this.metadata = metadata; + this.processor = processor; + this.protocol = protocol; + } + + /** + * Gets the Thrift metadata used by this instance. + */ + public ThriftMetadata.ThriftStruct getMetadata() { + return this.metadata; + } + + /** + * Deserializes the given serialized blob. + * + * @param bytes the serialized blob. + * @return deserialized instance. + * @throws TException if an error is encountered during deserialization. + */ + public Object deserialize(byte[] bytes) throws TException { Review comment: I'm still not convinced that this API need to be so dramatically different from the existing TDeserializer API (https://www.javadoc.io/doc/org.apache.thrift/libthrift/latest/org/apache/thrift/TDeserializer.html), can you try to match that as much as possible? The current TDeserializer API in Java already provided `partiallyDeserialize*` functions. I'm really worried about how this API will map to other languages, even if it makes sense for the Java case. ########## File path: lib/java/src/org/apache/thrift/partial/README.md ########## @@ -0,0 +1,112 @@ +# Partial Thrift Deserialization + +## Overview +This document describes how partial deserialization of Thrift works. There are two main goals of this documentation: +1. Make it easier to understand the current Java implementation in this folder. +1. Be useful in implementing partial deserialization support in additional languages. + +This document is divided into two high level areas. The first part explains important concepts relevant to partial deserialization. The second part describes components involved in the Java implementation in this folder. + +Moreover, this blog provides some performance numbers and addtional information: https://medium.com/pinterest-engineering/improving-data-processing-efficiency-using-partial-deserialization-of-thrift-16bc3a4a38b4 + +## Basic Concepts + +### Motivation + +The main motivation behind implementing this feature is to improve performance when we need to access only a subset of fields in any Thrift object. This situation arises often when big data is stored in Thrift encoded format (for example, SequenceFile with serialized Thrift values). Many data processing jobs may access this data. However, not every job needs to access every field of each object. In such cases, if we have prior knowledge of the fields needed for a given job, we can deserialize only that subset of fields and avoid the cost deserializing the rest of the fields. There are two benefits of this approach: we save cpu cycles by not deserializing unnecessary field and we end up reducing gc pressure. Both of the savings quickly add up when processing billions of instances in a data processing job. + +### Partial deserialization + +Partial deserialization involves deserializing only a subset of the fields of a serialized Thrift object while efficiently skipping over the rest. One very important benefit of partial deserialization is that the output of the deserialization process is not limited to a `TBase` derived object. It can deserialize a serialized blob into any type by using an appropriate `ThriftFieldValueProcessor`. + +### Defining the subset of fields to deserialize + +The subset of fields to deserialize is defined using a list of fully qualified field names. For example, consider the Thrift `struct` definition below: + +```Thrift +struct SmallStruct { + 1: optional string stringValue; + 2: optional i16 i16Value; +} + +struct TestStruct { + 1: optional i16 i16Field; + 2: optional list<SmallStruct> structList; + 3: optional set<SmallStruct> structSet; + 4: optional map<string, SmallStruct> structMap; + 5: optional SmallStruct structField; +} +``` + +For the Thrift `struct`, each of the following line shows a fully qualified field definition. Partial deserialization uses a non-empty set of such field definitions to identify the subset of fields to deserialize. + +``` +- i16Field +- structList.stringValue +- structSet.i16Value +- structMap.stringValue +- structField.i16Value +``` + +Note that the syntax of denoting paths involving map fields do not support a way to define sub-fields of the key type. + +For example, the field path `structMap.stringValue` shown above has leaf segment `stringValue` which is a field in map values. + +## Components + +The process of partial deserialization involves the following major components. We have listed names of the Java file(s) implementing each component for easier mapping to the source code. + +### Thrift Metadata + +Source files: +- ThriftField.java +- ThriftMetadata.java + +We saw in the previous section how we can identify the subset of fields to deserialize. As the first step, we need to compile the collection of field definitions into an efficient data structure that we can traverse at runtime. This step is achieved using `ThriftField` and `ThriftMetadata` classes. For example, + +```Java +// First, create a collection of fully qualified field names. +List<String> fieldNames = Arrays.asList("i16Field", "structField.i16Value"); + +// Convert the flat collection into an n-ary tree of fields. +List<ThriftField> fields = ThriftField.fromNames(fieldNames); + +// Compile the tree of fields into internally used metadata. +ThriftMetadata.ThriftStruct metadata = + ThriftMetadata.ThriftStruct.fromFields(TestStruct.class, fields); +``` + +At this point, we have an efficient internal representation of the fields that need to get deserialized. + +### Partial Thrift Protocol + +Source files: +- PartialThriftProtocol.java +- PartialThriftBinaryProtocol.java +- PartialThriftCompactProtocol.java + +This component implements efficient skipping over fields that need not be deserialized. Note that this skipping is more efficient compared to that achieved by using `TProtocolUtil.skip()`. The latter calls the corresponding `read()`, allocates and initializes certain values (for example, strings) and then discards the returned value. In comparison, `PartialThriftProtocol` skips a field by incrementing internal offset into the transport buffer. Review comment: The current `TProtocolUtil.skip` requires a `TProtocol` arg (https://www.javadoc.io/static/org.apache.thrift/libthrift/0.11.0/org/apache/thrift/protocol/TProtocolUtil.html), so you can totally then delegate to the concrete TProtocol to decide how to skip that field in the most efficient way. (for example, in Go we already have `Skip` function defined in the `TProtocol` interface, so it can be different from `read*`. we can probably do something similar in java.) ########## File path: lib/java/src/org/apache/thrift/partial/PartialThriftDeserializer.java ########## @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.thrift.partial; + +import org.apache.thrift.partial.Validate; + +import org.apache.thrift.TBase; +import org.apache.thrift.TEnum; +import org.apache.thrift.TException; +import org.apache.thrift.TFieldIdEnum; +import org.apache.thrift.meta_data.EnumMetaData; +import org.apache.thrift.meta_data.StructMetaData; +import org.apache.thrift.protocol.TList; +import org.apache.thrift.protocol.TMap; +import org.apache.thrift.protocol.TSet; +import org.apache.thrift.protocol.TType; + +/** + * Supports partial deserialization of serialized Thrift data. + * + * The end result of deserialization can be a Thrift object (? extends TBase) + * or it could be a different construct. It is achieved by using a + * ThriftFieldValueProcessor instance which processes each field value + * as it gets deserialized. + */ +public class PartialThriftDeserializer<T extends TBase> { + + // Metadata that describes fields to deserialize. + private final ThriftMetadata.ThriftStruct metadata; + + // Processor that handles deserialized field values. + private final ThriftFieldValueProcessor processor; + + // Partial thrift protocol to use for deserialization. + private final PartialThriftProtocol protocol; + + /** + * Constructs an instance of PartialThriftDeserializer. + * + * @param metadata the Metadata that describes fields to deserialize. + * @param processor the Processor that handles deserialized field values. + * @param protocol the Partial thrift protocol to use for deserialization. + */ + public PartialThriftDeserializer( + ThriftMetadata.ThriftStruct metadata, + ThriftFieldValueProcessor processor, + PartialThriftProtocol protocol) { + + Validate.checkNotNull(metadata, "metadata"); + Validate.checkNotNull(processor, "processor"); + Validate.checkNotNull(protocol, "protocol"); + + this.metadata = metadata; + this.processor = processor; + this.protocol = protocol; + } + + /** + * Gets the Thrift metadata used by this instance. + */ + public ThriftMetadata.ThriftStruct getMetadata() { + return this.metadata; + } + + /** + * Deserializes the given serialized blob. + * + * @param bytes the serialized blob. + * @return deserialized instance. + * @throws TException if an error is encountered during deserialization. + */ + public Object deserialize(byte[] bytes) throws TException { + return this.deserialize(bytes, 0, bytes.length); + } + + /** + * Deserializes the given serialized blob. + * + * @param bytes the serialized blob. + * @param offset the blob is read starting at this offset. + * @param length the size of blob read (in number of bytes). + * @return deserialized instance. + * @throws TException if an error is encountered during deserialization. + */ + public Object deserialize(byte[] bytes, int offset, int length) throws TException { + this.protocol.reset(bytes, offset, length); + return this.deserializeStruct(this.protocol, this.metadata); + } + + private Object deserialize( + PartialThriftProtocol tprot, + ThriftMetadata.ThriftObject data) throws TException { + + Object value; + byte fieldType = data.data.valueMetaData.type; + switch (fieldType) { + case TType.STRUCT: + return this.deserializeStruct(tprot, (ThriftMetadata.ThriftStruct) data); + + case TType.LIST: + return this.deserializeList(tprot, (ThriftMetadata.ThriftList) data); + + case TType.MAP: + return this.deserializeMap(tprot, (ThriftMetadata.ThriftMap) data); + + case TType.SET: + return this.deserializeSet(tprot, (ThriftMetadata.ThriftSet) data); + + case TType.ENUM: + return this.deserializeEnum(tprot, (ThriftMetadata.ThriftEnum) data); + + case TType.BOOL: + return tprot.readBool(); + + case TType.BYTE: + return tprot.readByte(); + + case TType.I16: + return tprot.readI16(); + + case TType.I32: + return tprot.readI32(); + + case TType.I64: + return tprot.readI64(); + + case TType.DOUBLE: + return tprot.readDouble(); + + case TType.STRING: + if (((ThriftMetadata.ThriftPrimitive) data).isBinary()) { + return this.processor.prepareBinary(tprot.readBinary()); + } else { + return this.processor.prepareString(tprot.readBinary()); + } + + default: + throw unsupportedFieldTypeException(fieldType); + } + } + + private Object deserializeStruct(PartialThriftProtocol tprot, ThriftMetadata.ThriftStruct data) + throws TException { + + if (data.fields.size() == 0) { + return this.fullDeserialize(tprot, data); + } + + Object instance = this.processor.createNewStruct(data); + tprot.readStructBegin(); + while (true) { + int tfieldData = tprot.readFieldBeginData(); + byte tfieldType = TFieldData.getType(tfieldData); + if (tfieldType == TType.STOP) { + break; + } + + Integer id = (int) TFieldData.getId(tfieldData); + ThriftMetadata.ThriftObject field = (ThriftMetadata.ThriftObject) data.fields.get(id); + + if (field != null) { + this.deserializeStructField(tprot, instance, field.fieldId, field); + } else { + tprot.skip(tfieldType); + } + tprot.readFieldEnd(); + } + tprot.readStructEnd(); + + return this.processor.prepareStruct(instance); + } + + private void deserializeStructField( + PartialThriftProtocol tprot, + Object instance, + TFieldIdEnum fieldId, + ThriftMetadata.ThriftObject data) throws TException { + + byte fieldType = data.data.valueMetaData.type; + Object value; + + switch (fieldType) { + case TType.BOOL: + this.processor.setBool(instance, fieldId, tprot.readBool()); + break; + + case TType.BYTE: + this.processor.setByte(instance, fieldId, tprot.readByte()); + break; + + case TType.I16: + this.processor.setInt16(instance, fieldId, tprot.readI16()); + break; + + case TType.I32: + this.processor.setInt32(instance, fieldId, tprot.readI32()); + break; + + case TType.I64: + this.processor.setInt64(instance, fieldId, tprot.readI64()); + break; + + case TType.DOUBLE: + this.processor.setDouble(instance, fieldId, tprot.readDouble()); + break; + + case TType.STRING: + if (((ThriftMetadata.ThriftPrimitive) data).isBinary()) { + this.processor.setBinary(instance, fieldId, tprot.readBinary()); + } else { + this.processor.setString(instance, fieldId, tprot.readBinary()); + } + break; + + case TType.STRUCT: + value = this.deserializeStruct(tprot, (ThriftMetadata.ThriftStruct) data); + this.processor.setStructField(instance, fieldId, value); + break; + + case TType.LIST: + value = this.deserializeList(tprot, (ThriftMetadata.ThriftList) data); + this.processor.setListField(instance, fieldId, value); + break; + + case TType.MAP: + value = this.deserializeMap(tprot, (ThriftMetadata.ThriftMap) data); + this.processor.setMapField(instance, fieldId, value); + break; + + case TType.SET: + value = this.deserializeSet(tprot, (ThriftMetadata.ThriftSet) data); + this.processor.setSetField(instance, fieldId, value); + break; + + case TType.ENUM: + value = this.deserializeEnum(tprot, (ThriftMetadata.ThriftEnum) data); + this.processor.setEnumField(instance, fieldId, value); + break; + + default: + throw new RuntimeException("Unsupported field type: " + fieldId.toString()); + } + } + + private Object deserializeList(PartialThriftProtocol tprot, ThriftMetadata.ThriftList data) + throws TException { + + TList tlist = tprot.readListBegin(); + Object instance = this.processor.createNewList(tlist.size); + for (int i = 0; i < tlist.size; i++) { + Object value = this.deserialize(tprot, data.elementData); + this.processor.setListElement(instance, i, value); + } + tprot.readListEnd(); + return this.processor.prepareList(instance); + } + + private Object deserializeMap(PartialThriftProtocol tprot, ThriftMetadata.ThriftMap data) + throws TException { + TMap tmap = tprot.readMapBegin(); + Object instance = this.processor.createNewMap(tmap.size); + for (int i = 0; i < tmap.size; i++) { + Object key = this.deserialize(tprot, data.keyData); + Object val = this.deserialize(tprot, data.valueData); + this.processor.setMapElement(instance, i, key, val); + } + tprot.readMapEnd(); + return this.processor.prepareMap(instance); + } + + private Object deserializeSet(PartialThriftProtocol tprot, ThriftMetadata.ThriftSet data) + throws TException { + TSet tset = tprot.readSetBegin(); + Object instance = this.processor.createNewSet(tset.size); + for (int i = 0; i < tset.size; i++) { + Object eltValue = this.deserialize(tprot, data.elementData); + this.processor.setSetElement(instance, i, eltValue); + } + tprot.readSetEnd(); + return this.processor.prepareSet(instance); + } + + private Object deserializeEnum(PartialThriftProtocol tprot, ThriftMetadata.ThriftEnum data) + throws TException { + int ordinal = tprot.readI32(); + Class<? extends TEnum> enumClass = ((EnumMetaData) data.data.valueMetaData).enumClass; + return this.processor.prepareEnum(enumClass, ordinal); + } + + private TBase fullDeserialize(PartialThriftProtocol tprot, ThriftMetadata.ThriftStruct data) + throws TException { + Validate.checkState( + data.fields.size() == 0, "Cannot fully deserialize when some fields specified"); Review comment: did you? the error message looks still the same as before? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
