[
https://issues.apache.org/jira/browse/NIFI-3726?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15985742#comment-15985742
]
ASF GitHub Bot commented on NIFI-3726:
--------------------------------------
Github user alopresto commented on a diff in the pull request:
https://github.com/apache/nifi/pull/1692#discussion_r113586324
--- Diff:
nifi-nar-bundles/nifi-cybersecurity-bundle/nifi-cybersecurity-processors/src/main/java/org/apache/nifi/processors/cybersecurity/CompareFuzzyHash.java
---
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nifi.processors.cybersecurity;
+
+import org.apache.nifi.annotation.behavior.EventDriven;
+import org.apache.nifi.annotation.behavior.InputRequirement;
+import org.apache.nifi.annotation.behavior.SideEffectFree;
+import org.apache.nifi.annotation.behavior.SupportsBatching;
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.SeeAlso;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.annotation.lifecycle.OnScheduled;
+import org.apache.nifi.components.AllowableValue;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.logging.ComponentLog;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.nifi.processors.cybersecurity.matchers.FuzzyHashMatcher;
+import org.apache.nifi.processors.cybersecurity.matchers.SSDeepHashMatcher;
+import org.apache.nifi.processors.cybersecurity.matchers.TLSHHashMatcher;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+
+@EventDriven
+@SideEffectFree
+@SupportsBatching
+@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
+@SeeAlso({FuzzyHashContent.class})
+@Tags({"hashing", "fuzzy-hashing", "cyber-security"})
+@CapabilityDescription("Compares an attribute containing a Fuzzy Hash
against a file containing a list of fuzzy hashes, " +
+ "appending an attribute to the FlowFile in case of a successful
match.")
+
+@WritesAttributes({
+ @WritesAttribute(attribute = "XXXX.N.match", description = "The
match that resembles the attribute specified " +
+ "by the <Hash Attribute Name> property. Note that: 'XXX'
gets replaced with the <Hash Attribute Name>"),
+ @WritesAttribute(attribute = "XXXX.N.similarity", description =
"The similarity score between this flowfile" +
+ "and its match of the same number N. Note that: 'XXX' gets
replaced with the <Hash Attribute Name>")})
+
+public class CompareFuzzyHash extends AbstractFuzzyHashProcessor {
+ public static final AllowableValue singleMatch = new AllowableValue(
+ "single",
+ "single",
+ "Send FlowFile to matched after the first match above
threshold");
+ public static final AllowableValue multiMatch = new AllowableValue(
+ "multi-match",
+ "multi-match",
+ "Iterate full list of hashes before deciding to send FlowFile
to matched or unmatched");
+
+ public static final PropertyDescriptor HASH_LIST_FILE = new
PropertyDescriptor.Builder()
+ .name("HASH_LIST_FILE")
+ .displayName("Hash List source file")
+ .description("Path to the file containing hashes to be
validated against")
+ .required(true)
+ .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
+ .build();
+
+ // Note we add a PropertyDescriptor HASH_ALGORITHM and ATTRIBUTE_NAME
from parent class
+
+ public static final PropertyDescriptor MATCH_THRESHOLD = new
PropertyDescriptor.Builder()
+ // Note that while both TLSH and SSDeep seems to return int,
we treat them as double in code.
+ // The rationale behind being the expectation that other
algorithms thatmay return double values
+ // may be added to the processor later on.
+ .name("MATCH_THRESHOLD")
+ .displayName("Match threshold")
+ .description("The similarity score must exceed or be equal to
in order for" +
+ "match to be considered true. Refer to Additional
Information for differences between TLSH " +
+ "and SSDEEP scores and how they relate to this
property.")
+ .required(true)
+ .addValidator(StandardValidators.NUMBER_VALIDATOR)
+ .build();
+
+ public static final PropertyDescriptor MATCHING_MODE = new
PropertyDescriptor.Builder()
+ .name("MATCHING_MODE")
+ .displayName("Matching mode")
+ .description("Defines if the Processor should try to match as
many entries as possible (" + multiMatch.getDisplayName() +
+ ") or if it should stio after the first match (" +
singleMatch.getDisplayName() + ")")
+ .required(true)
+ .allowableValues(singleMatch,multiMatch)
+ .defaultValue(singleMatch.getValue())
+ .build();
+
+ public static final Relationship REL_FOUND = new Relationship.Builder()
+ .name("found")
+ .description("Any FlowFile that is successfully matched to an
existing hash will be sent to this Relationship.")
+ .build();
+
+ public static final Relationship REL_NOT_FOUND = new
Relationship.Builder()
+ .name("not found")
+ .description("Any FlowFile that cannot be matched to an
existing hash will be sent to this Relationship.")
+ .build();
+
+ public static final Relationship REL_FAILURE = new
Relationship.Builder()
+ .name("failure")
+ .description("Any FlowFile that cannot be matched, e.g. (lacks
the attribute) will be sent to this Relationship.")
+ .build();
+
+ @Override
+ protected void init(final ProcessorInitializationContext context) {
+ final List<PropertyDescriptor> descriptors = new
ArrayList<PropertyDescriptor>();
+ descriptors.add(HASH_LIST_FILE);
+ // As mentioned above, add the PropertyDescriptor HASH_ALGORITHM
and ATTRIBUTE_NAME from parent class
+ descriptors.add(HASH_ALGORITHM);
+ descriptors.add(ATTRIBUTE_NAME);
+ descriptors.add(MATCH_THRESHOLD);
+ descriptors.add(MATCHING_MODE);
+ this.descriptors = Collections.unmodifiableList(descriptors);
+
+ final Set<Relationship> relationships = new
HashSet<Relationship>();
+ relationships.add(REL_FOUND);
+ relationships.add(REL_NOT_FOUND);
+ relationships.add(REL_FAILURE);
+ this.relationships = Collections.unmodifiableSet(relationships);
+ }
+
+ @Override
+ public Set<Relationship> getRelationships() {
+ return this.relationships;
+ }
+
+ @Override
+ public final List<PropertyDescriptor>
getSupportedPropertyDescriptors() {
+ return descriptors;
+ }
+
+ @OnScheduled
+ public void onScheduled(final ProcessContext context) {
+ }
+
+ @Override
+ public void onTrigger(ProcessContext context, ProcessSession session)
throws ProcessException {
+
+ FlowFile flowFile = session.get();
+ if (flowFile == null) {
+ return;
+ }
+
+ final ComponentLog logger = getLogger();
+ String algorithm = context.getProperty(HASH_ALGORITHM).getValue();
+
+ String inputHash =
flowFile.getAttribute(context.getProperty(ATTRIBUTE_NAME).getValue());
+
+ if (inputHash == null) {
+ getLogger().info("FlowFile {} lacks the required '{}'
attribute, routing to failure.",
+ new Object[]{flowFile,
context.getProperty(ATTRIBUTE_NAME).getValue() });
+ session.transfer(flowFile, REL_FAILURE);
+ return;
+ }
+
+ FuzzyHashMatcher fuzzyHashMatcher = null;
+
+ switch (algorithm) {
+ case tlsh:
+ fuzzyHashMatcher = new TLSHHashMatcher(getLogger());
+ break;
+ case ssdeep:
+ fuzzyHashMatcher = new SSDeepHashMatcher(getLogger());
+ break;
+ }
+
+ if (fuzzyHashMatcher.isValidHash(inputHash) == false) {
+ // and if that is the case we log
+ logger.error("Invalid hash provided. Sending to failure");
+ // and send to failure
+ session.transfer(flowFile, REL_FAILURE);
+ session.commit();
+ return;
+ }
+
+ File file = new
File(context.getProperty(HASH_LIST_FILE).getValue());
+
+ double similarity = 0;
+ double matchThreshold =
context.getProperty(MATCH_THRESHOLD).asDouble();
+
+ try {
+ Map<String, Double> matched = new ConcurrentHashMap<String,
Double>();
+ FileInputStream fileInputStream = new FileInputStream(file);
+ BufferedReader reader = new BufferedReader(new
InputStreamReader(fileInputStream));
+
+ // If SSdeep skip the first line (as the usual format used by
other tools add a header line
--- End diff --
I think the details of the file parsing should be delegated to, and encapsulated
by, the implementations. A method in the interface like `Map<String, Double>
getMatches(File definitionFile);` would then hide the various line-split
delimiters, etc. If you want to keep the loop logic here in the main processor,
at least implement `void prepareReader(BufferedReader reader);` (a no-op in most
implementations, skipping the header line in SSDeep) and
`String getHashToCompare(String line);`. The general idea is to remove the need
for `switch` statements that handle custom per-implementation logic at this level.
> Create FuzzyHash comparison processor
> -------------------------------------
>
> Key: NIFI-3726
> URL: https://issues.apache.org/jira/browse/NIFI-3726
> Project: Apache NiFi
> Issue Type: Improvement
> Reporter: Andre F de Miranda
> Assignee: Andre F de Miranda
>
> Now that the NiFi cyber-security package supports "Fuzzy Hashing", it may be a
> good idea to add a processor that makes use of it for comparison and
> routing of matches.
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)