Github user joewitt commented on a diff in the pull request:
https://github.com/apache/nifi/pull/2671#discussion_r191618134
--- Diff:
nifi-nar-bundles/nifi-marklogic-bundle/nifi-marklogic-processors/src/main/java/com/marklogic/nifi/processor/PutMarkLogic.java
---
@@ -0,0 +1,399 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.marklogic.nifi.processor;
+
import com.marklogic.client.datamovement.DataMovementManager;
import com.marklogic.client.datamovement.WriteBatcher;
import com.marklogic.client.datamovement.WriteEvent;
import com.marklogic.client.datamovement.impl.WriteEventImpl;
import com.marklogic.client.document.ServerTransform;
import com.marklogic.client.io.BytesHandle;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.Format;
import org.apache.nifi.annotation.behavior.SystemResource;
import org.apache.nifi.annotation.behavior.SystemResourceConsideration;
import org.apache.nifi.annotation.behavior.TriggerWhenEmpty;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.annotation.lifecycle.OnStopped;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.Validator;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.stream.io.StreamUtils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+
+
+/**
+ * The TriggerWhenEmpty annotation is used so that this processor has a
chance to flush the WriteBatcher when no
+ * flowfiles are ready to be received.
+ */
+@Tags({"MarkLogic", "Put", "Write", "Insert"})
+@CapabilityDescription("Write batches of FlowFiles as documents to a
MarkLogic server using the " +
+ "MarkLogic Data Movement SDK (DMSDK)")
+@SystemResourceConsideration(resource = SystemResource.MEMORY)
+@TriggerWhenEmpty
+public class PutMarkLogic extends AbstractMarkLogicProcessor {
+
+ class FlowFileInfo {
+ FlowFile flowFile;
+ ProcessSession session;
+ FlowFileInfo(FlowFile flowFile, ProcessSession session) {
+ this.flowFile = flowFile;
+ this.session = session;
+ }
+ }
+ private Map<String, FlowFileInfo> uriFlowFileMap = new HashMap<>();
    // --- User-facing properties. Validator.VALID is used for the optional free-form
    // properties because any value (including none) is acceptable; MarkLogic interprets them.

    // Optional: collections to assign to every written document.
    public static final PropertyDescriptor COLLECTIONS = new PropertyDescriptor.Builder()
        .name("Collections")
        .displayName("Collections")
        .description("Comma-delimited sequence of collections to add to each document")
        .required(false)
        .addValidator(Validator.VALID)
        .build();

    // Optional: document format; constrained to the Format enum's names.
    public static final PropertyDescriptor FORMAT = new PropertyDescriptor.Builder()
        .name("Format")
        .displayName("Format")
        .description("Format for each document; if not specified, MarkLogic will determine the format" +
            " based on the URI")
        .allowableValues(Format.JSON.name(), Format.XML.name(), Format.TEXT.name(), Format.BINARY.name(), Format.UNKNOWN.name())
        .required(false)
        .addValidator(Validator.VALID)
        .build();

    // Optional: identifier passed through to the DMSDK WriteBatcher job.
    public static final PropertyDescriptor JOB_ID = new PropertyDescriptor.Builder()
        .name("Job ID")
        .displayName("Job ID")
        .description("ID for the WriteBatcher job")
        .required(false)
        .addValidator(Validator.VALID)
        .build();

    // Optional: human-readable name passed through to the DMSDK WriteBatcher job.
    public static final PropertyDescriptor JOB_NAME = new PropertyDescriptor.Builder()
        .name("Job Name")
        .displayName("Job Name")
        .description("Name for the WriteBatcher job")
        .required(false)
        .addValidator(Validator.VALID)
        .build();

    // Optional: MIME type for each document.
    public static final PropertyDescriptor MIMETYPE = new PropertyDescriptor.Builder()
        .name("MIME type")
        .displayName("MIME type")
        .description("MIME type for each document; if not specified, MarkLogic will determine the " +
            "MIME type based on the URI")
        .addValidator(Validator.VALID)
        .required(false)
        .build();

    // Optional: role/capability pairs; defaults to standard REST read/update permissions.
    public static final PropertyDescriptor PERMISSIONS = new PropertyDescriptor.Builder()
        .name("Permissions")
        .displayName("Permissions")
        .defaultValue("rest-reader,read,rest-writer,update")
        .description("Comma-delimited sequence of permissions - role1, capability1, role2, " +
            "capability2 - to add to each document")
        .addValidator(Validator.VALID)
        .required(false)
        .build();

    // Optional: enables temporal document inserts when set.
    public static final PropertyDescriptor TEMPORAL_COLLECTION = new PropertyDescriptor.Builder()
        .name("Temporal collection")
        .displayName("Temporal collection")
        .description("The temporal collection to use for a temporal document insert")
        .addValidator(Validator.VALID)
        .required(false)
        .build();

    // Optional: name of a server-side REST transform applied to every document.
    public static final PropertyDescriptor TRANSFORM = new PropertyDescriptor.Builder()
        .name("Server transform")
        .displayName("Server transform")
        .description("The name of REST server transform to apply to every document as it's" +
            " written")
        .addValidator(Validator.VALID)
        .required(false)
        .build();

    // Required: which FlowFile attribute supplies the document URI; defaults to the
    // FlowFile's uuid attribute so every FlowFile yields a usable URI out of the box.
    public static final PropertyDescriptor URI_ATTRIBUTE_NAME = new PropertyDescriptor.Builder()
        .name("URI attribute name")
        .displayName("URI attribute name")
        .defaultValue("uuid")
        .required(true)
        .description("The name of the FlowFile attribute whose value will be used as the URI")
        .addValidator(StandardValidators.NON_BLANK_VALIDATOR)
        .build();

    // Optional: static prefix prepended to every URI.
    public static final PropertyDescriptor URI_PREFIX = new PropertyDescriptor.Builder()
        .name("URI prefix")
        .displayName("URI prefix")
        .description("The prefix to prepend to each URI")
        .required(false)
        .addValidator(Validator.VALID)
        .build();

    // Optional: static suffix appended to every URI (e.g. a file extension).
    public static final PropertyDescriptor URI_SUFFIX = new PropertyDescriptor.Builder()
        .name("URI suffix")
        .displayName("URI suffix")
        .description("The suffix to append to each URI")
        .required(false)
        .addValidator(Validator.VALID)
        .build();
+
    // Relationship for FlowFiles whose documents were part of a successfully written batch.
    // NOTE(review): NiFi convention is lowercase relationship names ("success"/"failure");
    // the name is part of the processor's external contract (flow definitions reference it),
    // so renaming after release would not be backward-compatible — worth settling pre-merge.
    protected static final Relationship SUCCESS = new Relationship.Builder()
        .name("SUCCESS")
        .description("All FlowFiles that are successfully written to MarkLogic are routed to the " +
            "success relationship for future processing.")
        .build();

    // Relationship for FlowFiles whose documents were part of a failed batch.
    protected static final Relationship FAILURE = new Relationship.Builder()
        .name("FAILURE")
        .description("All FlowFiles that failed to be written to MarkLogic are routed to the " +
            "failure relationship for future processing.")
        .build();
+
    // Created in onScheduled from the configured DatabaseClient; owns the WriteBatcher job.
    private volatile DataMovementManager dataMovementManager;
    // Batches incoming documents and writes them to MarkLogic on its own thread pool.
    private volatile WriteBatcher writeBatcher;
    // If no FlowFile exists when this processor is triggered, this variable determines whether or not a call is made to
    // flush the WriteBatcher
    private volatile boolean shouldFlushIfEmpty = true;
+
+ @Override
+ public void init(ProcessorInitializationContext context) {
+ super.init(context);
+
+ List<PropertyDescriptor> list = new ArrayList<>();
+ list.addAll(properties);
+ list.add(COLLECTIONS);
+ list.add(FORMAT);
+ list.add(JOB_ID);
+ list.add(JOB_NAME);
+ list.add(MIMETYPE);
+ list.add(PERMISSIONS);
+ list.add(TRANSFORM);
+ list.add(TEMPORAL_COLLECTION);
+ list.add(URI_ATTRIBUTE_NAME);
+ list.add(URI_PREFIX);
+ list.add(URI_SUFFIX);
+ properties = Collections.unmodifiableList(list);
+ Set<Relationship> set = new HashSet<>();
+ set.add(SUCCESS);
+ set.add(FAILURE);
+ relationships = Collections.unmodifiableSet(set);
+ }
+
    /**
     * Builds and starts the WriteBatcher job when the processor is scheduled. All batcher
     * configuration (transform, thread count, callbacks) must be applied before startJob
     * is called.
     */
    @OnScheduled
    public void onScheduled(ProcessContext context) {
        dataMovementManager = getDatabaseClient(context).newDataMovementManager();
        writeBatcher = dataMovementManager.newWriteBatcher()
            .withJobId(context.getProperty(JOB_ID).getValue())
            .withJobName(context.getProperty(JOB_NAME).getValue())
            .withBatchSize(context.getProperty(BATCH_SIZE).asInteger())
            .withTemporalCollection(context.getProperty(TEMPORAL_COLLECTION).getValue());

        // Only apply a server transform when one was configured.
        final String transform = context.getProperty(TRANSFORM).getValue();
        if (transform != null) {
            writeBatcher.withTransform(new ServerTransform(transform));
        }
        // Leave the WriteBatcher's default thread count in place unless one was configured.
        Integer threadCount = context.getProperty(THREAD_COUNT).asInteger();
        if(threadCount != null) {
            writeBatcher.withThreadCount(threadCount);
        }
        // These callbacks run on WriteBatcher threads; every document in a batch is routed to
        // SUCCESS or FAILURE based on the outcome of the batch as a whole.
        this.writeBatcher.onBatchSuccess(writeBatch -> {
            for(WriteEvent writeEvent : writeBatch.getItems()) {
                routeDocumentToRelationship(writeEvent, SUCCESS);
            }
        }).onBatchFailure((writeBatch, throwable) -> {
            for(WriteEvent writeEvent : writeBatch.getItems()) {
                routeDocumentToRelationship(writeEvent, FAILURE);
            }
        });
        dataMovementManager.startJob(writeBatcher);
    }
+
+ private void routeDocumentToRelationship(WriteEvent writeEvent,
Relationship relationship) {
+ DocumentMetadataHandle metadata = (DocumentMetadataHandle)
writeEvent.getMetadata();
+ String flowFileUUID =
metadata.getMetadataValues().get("flowFileUUID");
+ FlowFileInfo flowFile = uriFlowFileMap.get(flowFileUUID);
+ if(flowFile != null) {
+
flowFile.session.getProvenanceReporter().send(flowFile.flowFile,
writeEvent.getTargetUri());
+ flowFile.session.transfer(flowFile.flowFile, relationship);
+ flowFile.session.commit();
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("Routing " + writeEvent.getTargetUri() +
" to " + relationship.getName());
+ }
+ }
+ uriFlowFileMap.remove(flowFileUUID);
+ }
+
+ @Override
+ public final void onTrigger(final ProcessContext context, final
ProcessSessionFactory sessionFactory) throws ProcessException {
+ final ProcessSession session = sessionFactory.createSession();
+ try {
+ onTrigger(context, session);
+ } catch (final Throwable t) {
+ getLogger().error("{} failed to process due to {}; rolling
back session", new Object[]{this, t});
+ session.rollback(true);
+ throw new ProcessException(t);
+ }
+ }
+ /**
+ * When a FlowFile is received, hand it off to the WriteBatcher so it
can be written to MarkLogic.
+ * <p>
+ * If a FlowFile is not set (possible because of the TriggerWhenEmpty
annotation), then yield is called on the
+ * ProcessContext so that Nifi doesn't invoke this method repeatedly
when nothing is available. Then, a check is
+ * made to determine if flushAsync should be called on the
WriteBatcher. This ensures that any batch of documents
+ * that is smaller than the WriteBatcher's batch size will be flushed
immediately and not have to wait for more
+ * FlowFiles to arrive to fill out the batch.
+ *
+ */
+ public void onTrigger(ProcessContext context, ProcessSession session)
throws ProcessException {
+ FlowFile flowFile = session.get();
+ if (flowFile == null) {
+ context.yield();
+ if (shouldFlushIfEmpty) {
+ flushWriteBatcherAsync(this.writeBatcher);
+ }
+ shouldFlushIfEmpty = false;
+ } else {
+ shouldFlushIfEmpty = true;
+
+ WriteEvent writeEvent = buildWriteEvent(context, session,
flowFile);
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("Writing URI: " +
writeEvent.getTargetUri());
+ }
+ addWriteEvent(this.writeBatcher, writeEvent);
+ }
+ }
+
    /*
     * Protected so that it can be overridden for unit testing purposes.
     */
    // Asynchronously flushes any partial batch held by the WriteBatcher.
    protected void flushWriteBatcherAsync(WriteBatcher writeBatcher) {
        writeBatcher.flushAsync();
    }
+
    /*
     * Protected so that it can be overridden for unit testing purposes.
     */
    // Queues a single document for writing; the WriteBatcher writes it when a batch fills
    // (or when flushed).
    protected void addWriteEvent(WriteBatcher writeBatcher, WriteEvent writeEvent) {
        writeBatcher.add(writeEvent);
    }
+
+ protected WriteEvent buildWriteEvent(ProcessContext context,
ProcessSession session, FlowFile flowFile) {
+ String uri =
flowFile.getAttribute(context.getProperty(URI_ATTRIBUTE_NAME).getValue());
+ final String prefix = context.getProperty(URI_PREFIX).getValue();
+ if (prefix != null) {
+ uri = prefix + uri;
+ }
+ final String suffix = context.getProperty(URI_SUFFIX).getValue();
+ if (suffix != null) {
+ uri += suffix;
+ }
+
+ DocumentMetadataHandle metadata = new DocumentMetadataHandle();
+ final String collections =
context.getProperty(COLLECTIONS).getValue();
+ if (collections != null) {
+ metadata.withCollections(collections.split(","));
+ }
+ final String permissions =
context.getProperty(PERMISSIONS).getValue();
+ if (permissions != null) {
+ String[] tokens = permissions.split(",");
+ for (int i = 0; i < tokens.length; i += 2) {
+ String role = tokens[i];
+ String capability = tokens[i + 1];
+ metadata.withPermission(role,
DocumentMetadataHandle.Capability.getValueOf(capability));
+ }
+ }
+ // Add the flow file UUID for Provenance purposes and for sending
them
+ // to the appropriate relationship
+ String flowFileUUID =
flowFile.getAttribute(CoreAttributes.UUID.key());
+ metadata.withMetadataValue("flowFileUUID", flowFileUUID);
+ // TODO Haven't had luck with wrapping the FlowFile's inputStream
with an ML InputStreamHandle, so copying everything into a byte array
--- End diff --
I recommend removing all commented-out code/sections. If desired, a follow-on
JIRA could be filed to track such actions.
---