markap14 commented on a change in pull request #5772: URL: https://github.com/apache/nifi/pull/5772#discussion_r812044414
########## File path: nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/enrichment/IndexCorrelatedJoinStrategy.java ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nifi.processors.standard.enrichment; + +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.logging.ComponentLog; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.serialization.MalformedRecordException; +import org.apache.nifi.serialization.RecordReader; +import org.apache.nifi.serialization.record.Record; +import org.apache.nifi.serialization.record.RecordSchema; +import org.apache.nifi.serialization.record.RecordSet; + +import java.io.IOException; +import java.io.InputStream; + +public abstract class IndexCorrelatedJoinStrategy implements RecordJoinStrategy { + private final ComponentLog logger; + + public IndexCorrelatedJoinStrategy(final ComponentLog logger) { + this.logger = logger; + } + + protected ComponentLog getLogger() { + return logger; + } + + @Override + public RecordJoinResult join(final RecordJoinInput originalInput, final RecordJoinInput enrichmentInput, final ProcessSession session, final RecordSchema writerSchema) throws Exception { + final FlowFile originalFlowFile = originalInput.getFlowFile(); + final FlowFile enrichmentFlowFile = enrichmentInput.getFlowFile(); + + InputStream originalIn = null; + RecordReader originalRecordReader = null; + InputStream enrichmentIn = null; + RecordReader enrichmentRecordReader = null; + + try { + originalIn = session.read(originalFlowFile); + originalRecordReader = originalInput.getRecordReaderFactory().createRecordReader(originalFlowFile, originalIn, logger); + + enrichmentIn = session.read(enrichmentFlowFile); + enrichmentRecordReader = enrichmentInput.getRecordReaderFactory().createRecordReader(enrichmentFlowFile, enrichmentIn, logger); + + final Record firstOriginalRecord = originalRecordReader.nextRecord(); + final Record firstEnrichmentRecord = enrichmentRecordReader.nextRecord(); + + final RecordSchema resultSchema = createResultSchema(firstOriginalRecord, firstEnrichmentRecord); + + final InputStream finalOriginalIn = originalIn; + final 
RecordReader finalOriginalRecordReader = originalRecordReader; + final InputStream finalEnrichmentIn = enrichmentIn; + final RecordReader finalEnrichmentRecordReader = enrichmentRecordReader; + + final RecordSet recordSet = new RecordSet() { + private boolean usedFirstRecords = false; + + @Override + public RecordSchema getSchema() { + return resultSchema; + } + + @Override + public Record next() throws IOException { + if (!usedFirstRecords) { Review comment: So, in order to derive the schema for the Record Writer, we need to get the schema from both the 'original' and 'enrichment' readers. We then need to combine the records together to derive the output schema. In the case of `InsertRecordFieldsJoinStrategy`, the only way to do this is to obtain a record from each reader, combine them, and call `incorporateInactiveFields` on the resultant Record. So we do that to derive the output schema for the RecordWriter. But now we've already consumed the first record. So the first time through, we need to return that first Record. On every subsequent call, we just return the next Records. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
