autumnust commented on a change in pull request #2954: [GOBBLIN-1114] OrcValueMapper schema evolution up-conversion recursive
URL: https://github.com/apache/incubator-gobblin/pull/2954#discussion_r407135539
########## File path: gobblin-compaction/src/main/java/org/apache/gobblin/compaction/mapreduce/orc/OrcValueMapper.java ########## @@ -24,58 +24,80 @@ import org.apache.gobblin.compaction.mapreduce.RecordKeyMapperBase; import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.orc.OrcConf; import org.apache.orc.TypeDescription; import org.apache.orc.impl.ConvertTreeReaderFactory; import org.apache.orc.impl.SchemaEvolution; import org.apache.orc.mapred.OrcKey; +import org.apache.orc.mapred.OrcList; +import org.apache.orc.mapred.OrcMap; import org.apache.orc.mapred.OrcStruct; +import org.apache.orc.mapred.OrcUnion; import org.apache.orc.mapred.OrcValue; import org.apache.orc.mapreduce.OrcMapreduceRecordReader; +import com.google.common.annotations.VisibleForTesting; + +import lombok.extern.slf4j.Slf4j; + /** * To keep consistent with {@link OrcMapreduceRecordReader}'s decision on implementing * {@link RecordReader} with {@link NullWritable} as the key and generic type of value, the ORC Mapper will * read in the record as the input value. */ +@Slf4j public class OrcValueMapper extends RecordKeyMapperBase<NullWritable, OrcStruct, Object, OrcValue> { private OrcValue outValue; private TypeDescription mapperSchema; + // This is added mostly for debuggability. 
+ private static int writeCount = 0; + @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); this.outValue = new OrcValue(); - this.mapperSchema = TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute())); + this.mapperSchema = + TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute())); } @Override protected void map(NullWritable key, OrcStruct orcStruct, Context context) throws IOException, InterruptedException { - OrcStruct upConvertedStruct = upConvertOrcStruct(orcStruct, context); - if (context.getNumReduceTasks() == 0) { - this.outValue.value = upConvertedStruct; - context.write(NullWritable.get(), this.outValue); - } else { - this.outValue.value = upConvertedStruct; - context.write(getDedupKey(upConvertedStruct), this.outValue); + OrcStruct upConvertedStruct = upConvertOrcStruct(orcStruct, mapperSchema); + try { + if (context.getNumReduceTasks() == 0) { + this.outValue.value = upConvertedStruct; + context.write(NullWritable.get(), this.outValue); + } else { + this.outValue.value = upConvertedStruct; + context.write(getDedupKey(upConvertedStruct), this.outValue); + } + } catch (Exception e) { + throw new RuntimeException("Failure in write record no." + writeCount, e); Review comment: Yes, finding the filename here seems non-trivial since I need the split id to be available in the context object. I have another ticket to track this work. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services