METRON-1366: Add an entropy stellar function (cstella via mmiklavc) closes apache/metron#872


Project: http://git-wip-us.apache.org/repos/asf/metron/repo
Commit: http://git-wip-us.apache.org/repos/asf/metron/commit/e127797e
Tree: http://git-wip-us.apache.org/repos/asf/metron/tree/e127797e
Diff: http://git-wip-us.apache.org/repos/asf/metron/diff/e127797e

Branch: refs/heads/feature/METRON-1211-extensions-parsers-gradual
Commit: e127797e63ecd3af2d9aa8fc062e36e28015aac6
Parents: 4324375
Author: cstella <[email protected]>
Authored: Thu Jan 11 14:45:33 2018 -0700
Committer: Michael Miklavcic <[email protected]>
Committed: Thu Jan 11 14:45:33 2018 -0700

----------------------------------------------------------------------
 metron-analytics/metron-statistics/README.md    |  7 +++
 .../InformationTheoryFunctions.java             | 53 ++++++++++++++++++++
 .../InformationTheoryUtil.java                  | 52 +++++++++++++++++++
 .../informationtheory/EntropyTest.java          | 46 +++++++++++++++++
 metron-stellar/stellar-common/README.md         |  1 +
 5 files changed, 159 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/README.md
----------------------------------------------------------------------
diff --git a/metron-analytics/metron-statistics/README.md b/metron-analytics/metron-statistics/README.md
index 1b83245..d1109ca 100644
--- a/metron-analytics/metron-statistics/README.md
+++ b/metron-analytics/metron-statistics/README.md
@@ -217,6 +217,13 @@ functions can be used from everywhere where Stellar is used.
     * stats - The Stellar statistics object
   * Returns: The variance of the values in the window or NaN if the statistics object is null.
 
+### Information Theory Functions
+
+#### `IT_ENTROPY`
+  * Description: Computes the base-2 entropy of a multiset
+  * Input:
+    * input - a multiset (a map of objects to counts).
+  * Returns: The [base-2 entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition) of the count .  The unit of this is bits.
 
 ### Statistical Outlier Detection
   

http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java
----------------------------------------------------------------------
diff --git a/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java
new file mode 100644
index 0000000..705bf7a
--- /dev/null
+++ b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java
@@ -0,0 +1,53 @@
+/*
+ *
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+package org.apache.metron.statistics.informationtheory;
+
+import org.apache.metron.stellar.dsl.BaseStellarFunction;
+import org.apache.metron.stellar.dsl.Stellar;
+
+import java.util.List;
+import java.util.Map;
+
+public class InformationTheoryFunctions {
+  @Stellar( namespace="IT"
+          , name="ENTROPY"
+          , description = "Computes the base-2 entropy of a multiset"
+          , params = { "input - a multiset (a map of objects to counts)" }
+          , returns = "The [base-2 entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition) of the count .  The unit of this is bits."
+  )
+  public static class Entropy extends BaseStellarFunction {
+
+    @Override
+    public Object apply(List<Object> args) {
+      if(args.isEmpty()) {
+        throw new IllegalArgumentException("IT_ENTROPY expects exactly one argument.");
+      }
+      Object inputObj = args.get(0);
+      if(inputObj == null) {
+        return null;
+      }
+      if(!(inputObj instanceof Map)) {
+        throw new IllegalArgumentException("IT_ENTROPY expects exactly one argument and expects it to be a map of counts (e.g. Map<?, Integer>)");
+      }
+      Map<?, Integer> countMap = (Map<?, Integer>) inputObj;
+      return InformationTheoryUtil.INSTANCE.bitEntropy(countMap);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java
----------------------------------------------------------------------
diff --git a/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java
new file mode 100644
index 0000000..94593ec
--- /dev/null
+++ b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java
@@ -0,0 +1,52 @@
+/*
+ *
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+package org.apache.metron.statistics.informationtheory;
+
+import java.util.Map;
+
+public enum InformationTheoryUtil {
+  INSTANCE;
+  private static final double LOG2 = Math.log(2);
+
+  public double entropy(Map<?, Integer> counts, double logOfBase) {
+    double ret = 0.0;
+    int n = 0;
+    if(counts == null || counts.isEmpty()) {
+      return ret;
+    }
+    for(Integer f : counts.values()) {
+      n+=f;
+    }
+
+    for(Integer f : counts.values()) {
+      double p = f.doubleValue()/n;
+      ret -= p * Math.log(p) / logOfBase;
+    }
+    return ret;
+  }
+
+  public double entropy(Map<?, Integer> counts, int base) {
+    return entropy(counts, Math.log(base));
+  }
+
+  public double bitEntropy(Map<?, Integer> counts) {
+    return entropy(counts, LOG2);
+  }
+}

http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java
----------------------------------------------------------------------
diff --git a/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java b/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java
new file mode 100644
index 0000000..a168b2e
--- /dev/null
+++ b/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.metron.statistics.informationtheory;
+
+import com.google.common.collect.ImmutableMap;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.run;
+
+public class EntropyTest {
+  @Test
+  public void entropyTest() throws Exception {
+    //test empty collection
+    Assert.assertEquals(0.0, (Double) run("IT_ENTROPY({})", new HashMap<>()), 0.0);
+
+    /*
+    Now consider the string aaaaaaaaaabbbbbccccc or 10 a's followed by 5 b's and 5 c's.
+    The probabilities of each character is as follows:
+    p(a) = 1/2
+    p(b) = 1/4
+    p(c) = 1/4
+    so the shannon entropy should be
+      -p(a)*log_2(p(a)) - p(b)*log_2(p(b)) - p(c)*log_2(p(c)) =
+      -0.5*-1 - 0.25*-2 - 0.25*-2 = 1.5
+     */
+    Assert.assertEquals(1.5, (Double) run("IT_ENTROPY({ 'a' : 10, 'b' : 5, 'c' : 5} )", new HashMap<>()), 0.0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-stellar/stellar-common/README.md
----------------------------------------------------------------------
diff --git a/metron-stellar/stellar-common/README.md b/metron-stellar/stellar-common/README.md
index 2ef81e8..4796889 100644
--- a/metron-stellar/stellar-common/README.md
+++ b/metron-stellar/stellar-common/README.md
@@ -189,6 +189,7 @@ Where:
 | [ `HLLP_INIT`](../../metron-analytics/metron-statistics#hllp_init)                                 |
 | [ `HLLP_MERGE`](../../metron-analytics/metron-statistics#hllp_merge)                               |
 | [ `IN_SUBNET`](#in_subnet)                                                                         |
+| [ `IT_ENTROPY`](../../metron-analytics/metron-statistics#it_entropy)                               |
 | [ `IS_DATE`](#is_date)                                                                             |
 | [ `IS_ENCODING`](#is_encoding)                                                                     |
 | [ `IS_DOMAIN`](#is_domain)                                                                         |

Reply via email to