[ https://issues.apache.org/jira/browse/NIFI-4963?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16502276#comment-16502276 ]
ASF GitHub Bot commented on NIFI-4963:
--------------------------------------

Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/2755#discussion_r193176667

--- Diff: nifi-nar-bundles/nifi-hive-bundle/nifi-hive3-processors/src/main/java/org/apache/nifi/processors/hive/PutHive3Streaming.java ---
@@ -0,0 +1,548 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.hive;
+
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hive.streaming.ConnectionError;
+import org.apache.hive.streaming.HiveStreamingConnection;
+import org.apache.hive.streaming.InvalidTable;
+import org.apache.hive.streaming.SerializationError;
+import org.apache.hive.streaming.StreamingConnection;
+import org.apache.hive.streaming.StreamingException;
+import org.apache.hive.streaming.StreamingIOFailure;
+import org.apache.hive.streaming.TransactionError;
+import org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading;
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.annotation.lifecycle.OnScheduled;
+import org.apache.nifi.annotation.lifecycle.OnStopped;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.components.ValidationContext;
+import org.apache.nifi.components.ValidationResult;
+import org.apache.nifi.expression.ExpressionLanguageScope;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.hadoop.SecurityUtil;
+import org.apache.nifi.kerberos.KerberosCredentialsService;
+import org.apache.nifi.logging.ComponentLog;
+import org.apache.nifi.processor.AbstractProcessor;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.nifi.processor.util.pattern.DiscontinuedException;
+import org.apache.nifi.processor.util.pattern.RollbackOnFailure;
+import org.apache.nifi.processors.hadoop.exception.RecordReaderFactoryException;
+import org.apache.nifi.serialization.RecordReader;
+import org.apache.nifi.serialization.RecordReaderFactory;
+import org.apache.nifi.util.StringUtils;
+import org.apache.nifi.util.hive.AuthenticationFailedException;
+import org.apache.nifi.util.hive.HiveConfigurator;
+import org.apache.nifi.util.hive.HiveOptions;
+import org.apache.hive.streaming.HiveRecordWriter;
+import org.apache.nifi.util.hive.HiveUtils;
+import org.apache.nifi.util.hive.ValidationResources;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import static org.apache.nifi.processors.hive.AbstractHive3QLProcessor.ATTR_OUTPUT_TABLES;
+
+@Tags({"hive", "streaming", "put", "database", "store"})
+@CapabilityDescription("This processor uses Hive Streaming to send flow file records to an Apache Hive 3.0+ table. " +
+        "The partition values are expected to be the 'last' fields of each record, so if the table is partitioned on column A, for example, then the last field in " +
+        "each record should be field A.")
+@WritesAttributes({
+        @WritesAttribute(attribute = "hivestreaming.record.count", description = "This attribute is written on the flow files routed to the 'success' " +
+                "and 'failure' relationships, and contains the number of records from the incoming flow file. All records in a flow file are committed as a single transaction."),
+        @WritesAttribute(attribute = "query.output.tables", description = "This attribute is written on the flow files routed to the 'success' " +
+                "and 'failure' relationships, and contains the target table name in 'databaseName.tableName' format.")
+})
+@RequiresInstanceClassLoading
+public class PutHive3Streaming extends AbstractProcessor {
+    // Attributes
+    public static final String HIVE_STREAMING_RECORD_COUNT_ATTR = "hivestreaming.record.count";
+
+    private static final String CLIENT_CACHE_DISABLED_PROPERTY = "hcatalog.hive.client.cache.disabled";
+
+    // Properties
+    static final PropertyDescriptor RECORD_READER = new PropertyDescriptor.Builder()
+            .name("record-reader")
+            .displayName("Record Reader")
+            .description("The service for reading records from incoming flow files.")
+            .identifiesControllerService(RecordReaderFactory.class)
+            .required(true)
+            .build();
+
+    static final PropertyDescriptor METASTORE_URI = new PropertyDescriptor.Builder()
+            .name("hive3-stream-metastore-uri")
+            .displayName("Hive Metastore URI")
+            .description("The URI location for the Hive Metastore. Note that this is not the location of the Hive Server. The default port for the " +
+                    "Hive metastore is 9083.")
+            .required(true)
+            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
+            .addValidator(StandardValidators.URI_VALIDATOR)
+            .addValidator(StandardValidators.createRegexMatchingValidator(Pattern.compile("(^[^/]+.*[^/]+$|^[^/]+$|^$)"))) // must not start or end with '/'
+            .build();
+
+    static final PropertyDescriptor HIVE_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder()
+            .name("hive3-config-resources")
+            .displayName("Hive Configuration Resources")
+            .description("A file, or a comma-separated list of files, containing the Hive configuration (e.g., hive-site.xml). Without this, Hadoop " +
+                    "will search the classpath for a 'hive-site.xml' file or will revert to a default configuration. " +
+                    "Note that to enable authentication with Kerberos, for example, the appropriate properties must be set in the configuration files. Also note that if Max Concurrent Tasks is set " +
+                    "to a number greater than one, the 'hcatalog.hive.client.cache.disabled' property will be forced to 'true' to avoid concurrency issues. " +
+                    "Please see the Hive documentation for more details.")
+            .required(false)
+            .addValidator(HiveUtils.createMultipleFilesExistValidator())
+            .build();
+
+    static final PropertyDescriptor DB_NAME = new PropertyDescriptor.Builder()
+            .name("hive3-stream-database-name")
+            .displayName("Database Name")
+            .description("The name of the database in which to put the data.")
+            .required(true)
+            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor TABLE_NAME = new PropertyDescriptor.Builder()
+            .name("hive3-stream-table-name")
+            .displayName("Table Name")
+            .description("The name of the database table in which to put the data.")
+            .required(true)
+            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor PARTITION_VALUES = new PropertyDescriptor.Builder()
+            .name("hive3-stream-part-vals")
+            .displayName("Partition Values")
+            .description("Specifies a comma-separated list of the values for the partition columns of the target table. If the incoming records all have the same values " +
+                    "for the partition columns, those values can be entered here, resulting in a performance gain. If specified, this property will often contain " +
+                    "Expression Language. For example, if PartitionRecord is upstream and two partitions 'name' and 'age' are used, then this property can be set to " +
+                    "${name},${age}.")
+            .required(false)
+            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
+            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .build();
+
+    static final PropertyDescriptor AUTOCREATE_PARTITIONS = new PropertyDescriptor.Builder()
+            .name("hive3-stream-autocreate-partition")
+            .displayName("Auto-Create Partitions")
+            .description("Flag indicating whether partitions should be automatically created.")
+            .required(true)
+            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+            .allowableValues("true", "false")
+            .defaultValue("true")
+            .build();
+
+    static final PropertyDescriptor CALL_TIMEOUT = new PropertyDescriptor.Builder()
+            .name("hive3-stream-call-timeout")
+            .displayName("Call Timeout")
+            .description("The number of seconds allowed for a Hive Streaming operation to complete. A value of 0 indicates the processor should wait indefinitely on operations. " +
+                    "Note that although this property supports Expression Language, it will not be evaluated against incoming FlowFile attributes.")
+            .defaultValue("0")
+            .required(true)
+            .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
+            .expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
+            .build();
+
+    static final PropertyDescriptor DISABLE_STREAMING_OPTIMIZATIONS = new PropertyDescriptor.Builder()
+            .name("hive3-stream-disable-optimizations")
+            .displayName("Disable Streaming Optimizations")
+            .description("Whether to disable streaming optimizations. " +
+                    "Disabling streaming optimizations will have a significant impact on performance and memory consumption.")
+            .required(true)
+            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+            .allowableValues("true", "false")
+            .defaultValue("false")
+            .build();
+
+
+    static final PropertyDescriptor ROLLBACK_ON_FAILURE = RollbackOnFailure.createRollbackOnFailureProperty(
+            "NOTE: When an error occurs after a Hive Streaming transaction derived from the same input FlowFile has already been committed" +
+            " (i.e. a FlowFile contains more records than 'Records per Transaction' and a failure occurs at the 2nd transaction or later)," +
+            " the successfully written records will be transferred to the 'success' relationship while the original input FlowFile stays in the incoming queue." +
+            " Duplicate records can be created for those records when the same FlowFile is processed again.");
+
+    static final PropertyDescriptor KERBEROS_CREDENTIALS_SERVICE = new PropertyDescriptor.Builder()
+            .name("kerberos-credentials-service")
+            .displayName("Kerberos Credentials Service")
+            .description("Specifies the Kerberos Credentials Controller Service that should be used for authenticating with Kerberos.")
+            .identifiesControllerService(KerberosCredentialsService.class)
+            .required(false)
+            .build();
+
+    // Relationships
+    public static final Relationship REL_SUCCESS = new Relationship.Builder()
+            .name("success")
+            .description("A FlowFile containing Avro records is routed to this relationship after the records have been successfully transmitted to Hive.")
+            .build();
+
+    public static final Relationship REL_FAILURE = new Relationship.Builder()
+            .name("failure")
+            .description("A FlowFile containing Avro records is routed to this relationship if the records could not be transmitted to Hive.")
+            .build();
+
+    public static final Relationship REL_RETRY = new Relationship.Builder()
+            .name("retry")
+            .description("The incoming FlowFile is routed to this relationship if its records cannot be transmitted to Hive. Note that " +
+                    "some records may have been processed successfully; they will be routed (as Avro flow files) to the success relationship. " +
+                    "The combination of the retry, success, and failure relationships indicates how many records succeeded and/or failed. " +
This " + + "can be used to provide a retry capability since full rollback is not possible.") + .build(); + + private List<PropertyDescriptor> propertyDescriptors; + private Set<Relationship> relationships; + + protected volatile HiveConfigurator hiveConfigurator = new HiveConfigurator(); + protected volatile UserGroupInformation ugi; + protected volatile HiveConf hiveConfig; + + protected volatile int callTimeout; + protected ExecutorService callTimeoutPool; + protected volatile boolean rollbackOnFailure; + + // Holder of cached Configuration information so validation does not reload the same config over and over + private final AtomicReference<ValidationResources> validationResourceHolder = new AtomicReference<>(); + + @Override + protected void init(ProcessorInitializationContext context) { + List<PropertyDescriptor> props = new ArrayList<>(); + props.add(RECORD_READER); + props.add(METASTORE_URI); + props.add(HIVE_CONFIGURATION_RESOURCES); + props.add(DB_NAME); + props.add(TABLE_NAME); + props.add(PARTITION_VALUES); + props.add(AUTOCREATE_PARTITIONS); + props.add(CALL_TIMEOUT); + props.add(DISABLE_STREAMING_OPTIMIZATIONS); + props.add(ROLLBACK_ON_FAILURE); + props.add(KERBEROS_CREDENTIALS_SERVICE); + + propertyDescriptors = Collections.unmodifiableList(props); + + Set<Relationship> _relationships = new HashSet<>(); + _relationships.add(REL_SUCCESS); + _relationships.add(REL_FAILURE); + _relationships.add(REL_RETRY); + relationships = Collections.unmodifiableSet(_relationships); + } + + @Override + protected List<PropertyDescriptor> getSupportedPropertyDescriptors() { + return propertyDescriptors; + } + + @Override + public Set<Relationship> getRelationships() { + return relationships; + } + + @Override + protected Collection<ValidationResult> customValidate(final ValidationContext validationContext) { + boolean confFileProvided = validationContext.getProperty(HIVE_CONFIGURATION_RESOURCES).isSet(); + + final List<ValidationResult> problems = new ArrayList<>(); + + final KerberosCredentialsService credentialsService = validationContext.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class); + + final String resolvedPrincipal = credentialsService != null ? credentialsService.getPrincipal() : null; + final String resolvedKeytab = credentialsService != null ? 
+        if (confFileProvided) {
+            final String configFiles = validationContext.getProperty(HIVE_CONFIGURATION_RESOURCES).evaluateAttributeExpressions().getValue();
+            problems.addAll(hiveConfigurator.validate(configFiles, resolvedPrincipal, resolvedKeytab, validationResourceHolder, getLogger()));
+        }
+
+        return problems;
+    }
+
+    @OnScheduled
+    public void setup(final ProcessContext context) {
+        ComponentLog log = getLogger();
+        rollbackOnFailure = context.getProperty(ROLLBACK_ON_FAILURE).asBoolean();
+
+        final String configFiles = context.getProperty(HIVE_CONFIGURATION_RESOURCES).getValue();
+        hiveConfig = hiveConfigurator.getConfigurationFromFiles(configFiles);
+
+        // If more than one concurrent task, force 'hcatalog.hive.client.cache.disabled' to true
+        if (context.getMaxConcurrentTasks() > 1) {
+            hiveConfig.setBoolean(CLIENT_CACHE_DISABLED_PROPERTY, true);
+        }
+
+        // add any dynamic properties to the Hive configuration
+        for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
+            final PropertyDescriptor descriptor = entry.getKey();
+            if (descriptor.isDynamic()) {
+                hiveConfig.set(descriptor.getName(), entry.getValue());
+            }
+        }
+
+        hiveConfigurator.preload(hiveConfig);
+
+        if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
+            final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
+
+            final String resolvedPrincipal = credentialsService.getPrincipal();
+            final String resolvedKeytab = credentialsService.getKeytab();
+
+            log.info("Hive Security Enabled, logging in as principal {} with keytab {}", new Object[]{resolvedPrincipal, resolvedKeytab});
+            try {
+                ugi = hiveConfigurator.authenticate(hiveConfig, resolvedPrincipal, resolvedKeytab);
+            } catch (AuthenticationFailedException ae) {
+                throw new ProcessException("Kerberos authentication failed for Hive Streaming", ae);
+            }
+
+            log.info("Successfully logged in as principal {} with keytab {}", new Object[]{resolvedPrincipal, resolvedKeytab});
+        } else {
+            ugi = null;
+        }
+
+        callTimeout = context.getProperty(CALL_TIMEOUT).evaluateAttributeExpressions().asInteger() * 1000; // milliseconds
+        String timeoutName = "put-hive3-streaming-%d";
+        this.callTimeoutPool = Executors.newFixedThreadPool(1,
+                new ThreadFactoryBuilder().setNameFormat(timeoutName).build());
+    }
+
+    @Override
+    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
+        FlowFile flowFile = session.get();
+        if (flowFile == null) {
+            return;
+        }
+
+        final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER).asControllerService(RecordReaderFactory.class);
+        final String dbName = context.getProperty(DB_NAME).evaluateAttributeExpressions(flowFile).getValue();
+        final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
+
+        final ComponentLog log = getLogger();
+        final String metastoreUri = context.getProperty(METASTORE_URI).evaluateAttributeExpressions(flowFile).getValue();
+
+        final String partitionValuesString = context.getProperty(PARTITION_VALUES).evaluateAttributeExpressions(flowFile).getValue();
+        final boolean autoCreatePartitions = context.getProperty(AUTOCREATE_PARTITIONS).asBoolean();
+        final boolean disableStreamingOptimizations = context.getProperty(DISABLE_STREAMING_OPTIMIZATIONS).asBoolean();
+
+        HiveOptions o = new HiveOptions(metastoreUri, dbName, tableName)
+                .withHiveConf(hiveConfig)
+                .withAutoCreatePartitions(autoCreatePartitions)
+                .withCallTimeout(callTimeout)
+                .withStreamingOptimizations(!disableStreamingOptimizations);
+
+        if (!StringUtils.isEmpty(partitionValuesString)) {
+            List<String> staticPartitionValues = Arrays.stream(partitionValuesString.split(",")).filter(Objects::nonNull).map(String::trim).collect(Collectors.toList());
+            o = o.withStaticPartitionValues(staticPartitionValues);
+        }
+
+        if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
+            final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
+            o = o.withKerberosPrincipal(credentialsService.getPrincipal()).withKerberosKeytab(credentialsService.getKeytab());
+        }
+
+        final HiveOptions options = o;
+
+        // Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
+        ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
+        Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
+
+        StreamingConnection hiveStreamingConnection = null;
+
+        try (final InputStream rawIn = session.read(flowFile)) {
+            long processedRecords = 0L;
+            final RecordReader reader;
+
+            try (final BufferedInputStream in = new BufferedInputStream(rawIn)) {
+
+                // if we fail to create the RecordReader then we want to route to failure, so we need to
+                // handle this separately from the other IOExceptions which normally route to retry
+                try {
+                    reader = recordReaderFactory.createRecordReader(flowFile, in, getLogger());
+                } catch (Exception e) {
+                    throw new RecordReaderFactoryException("Unable to create RecordReader", e);
+                }
+
+                hiveStreamingConnection = makeStreamingConnection(options, reader);
+
+                // Write records to Hive streaming, then commit and close
+                hiveStreamingConnection.beginTransaction();
+                hiveStreamingConnection.write(in);
--- End diff --

Looks like we are creating one connection per flow file. I don't know much about flow files. How many rows does a flow file typically have? The recommendation is to avoid committing too frequently; committing after every 100000 to 1M rows will give the best performance (see the sketch after the issue summary below).

> Add support for Hive 3.0 processors
> -----------------------------------
>
>                 Key: NIFI-4963
>                 URL: https://issues.apache.org/jira/browse/NIFI-4963
>             Project: Apache NiFi
>          Issue Type: New Feature
>          Components: Extensions
>            Reporter: Matt Burgess
>            Assignee: Matt Burgess
>            Priority: Major
>
> Apache Hive is working on Hive 3.0; this Jira is to add a bundle of components (much like the current Hive bundle) that supports Hive 3.0 (and Apache ORC if necessary).

--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
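For context on the commit-batching recommendation in the review comment above, here is a minimal sketch of committing every N rows over a single Hive 3 streaming connection. The beginTransaction()/write() calls mirror the diff above; the builder chain and StrictDelimitedInputWriter follow the Hive 3 streaming API as documented upstream, while the database/table names, delimiter, and 100,000-row threshold are illustrative assumptions, not code from this PR.

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hive.streaming.HiveStreamingConnection;
    import org.apache.hive.streaming.StreamingConnection;
    import org.apache.hive.streaming.StreamingException;
    import org.apache.hive.streaming.StrictDelimitedInputWriter;

    import java.nio.charset.StandardCharsets;
    import java.util.List;

    public class BatchedCommitSketch {

        // Threshold suggested in the review comment (100k to 1M rows per commit).
        private static final int ROWS_PER_TRANSACTION = 100_000;

        public static void writeBatched(HiveConf conf, List<String> rows) throws StreamingException {
            // Writer for comma-delimited text records (illustrative choice).
            StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
                    .withFieldDelimiter(',')
                    .build();

            // One connection reused across many transactions, rather than one
            // connection (and one commit) per incoming batch/flow file.
            StreamingConnection connection = HiveStreamingConnection.newBuilder()
                    .withDatabase("default")            // hypothetical database
                    .withTable("example_table")         // hypothetical table
                    .withAgentInfo("batched-commit-sketch")
                    .withRecordWriter(writer)
                    .withHiveConf(conf)
                    .connect();
            try {
                long rowsInTxn = 0;
                connection.beginTransaction();
                for (String row : rows) {
                    connection.write(row.getBytes(StandardCharsets.UTF_8));
                    if (++rowsInTxn >= ROWS_PER_TRANSACTION) {
                        connection.commitTransaction(); // commit a large batch...
                        connection.beginTransaction();  // ...and start the next one
                        rowsInTxn = 0;
                    }
                }
                connection.commitTransaction();         // commit the final partial batch
            } finally {
                connection.close();
            }
        }
    }

In the processor being reviewed, the analogous change would be accumulating records across flow files (or at least across all records within one flow file) before committing, instead of opening a connection and committing per flow file.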