METRON-1366: Add an entropy stellar function (cstella via mmiklavc) closes apache/metron#872
Project: http://git-wip-us.apache.org/repos/asf/metron/repo Commit: http://git-wip-us.apache.org/repos/asf/metron/commit/e127797e Tree: http://git-wip-us.apache.org/repos/asf/metron/tree/e127797e Diff: http://git-wip-us.apache.org/repos/asf/metron/diff/e127797e Branch: refs/heads/feature/METRON-1211-extensions-parsers-gradual Commit: e127797e63ecd3af2d9aa8fc062e36e28015aac6 Parents: 4324375 Author: cstella <[email protected]> Authored: Thu Jan 11 14:45:33 2018 -0700 Committer: Michael Miklavcic <[email protected]> Committed: Thu Jan 11 14:45:33 2018 -0700 ---------------------------------------------------------------------- metron-analytics/metron-statistics/README.md | 7 +++ .../InformationTheoryFunctions.java | 53 ++++++++++++++++++++ .../InformationTheoryUtil.java | 52 +++++++++++++++++++ .../informationtheory/EntropyTest.java | 46 +++++++++++++++++ metron-stellar/stellar-common/README.md | 1 + 5 files changed, 159 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/README.md ---------------------------------------------------------------------- diff --git a/metron-analytics/metron-statistics/README.md b/metron-analytics/metron-statistics/README.md index 1b83245..d1109ca 100644 --- a/metron-analytics/metron-statistics/README.md +++ b/metron-analytics/metron-statistics/README.md @@ -217,6 +217,13 @@ functions can be used from everywhere where Stellar is used. * stats - The Stellar statistics object * Returns: The variance of the values in the window or NaN if the statistics object is null. +### Information Theory Functions + +#### `IT_ENTROPY` + * Description: Computes the base-2 entropy of a multiset + * Input: + * input - a multiset (a map of objects to counts). + * Returns: The [base-2 entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition) of the count . The unit of this is bits. ### Statistical Outlier Detection http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java ---------------------------------------------------------------------- diff --git a/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java new file mode 100644 index 0000000..705bf7a --- /dev/null +++ b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryFunctions.java @@ -0,0 +1,53 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.metron.statistics.informationtheory; + +import org.apache.metron.stellar.dsl.BaseStellarFunction; +import org.apache.metron.stellar.dsl.Stellar; + +import java.util.List; +import java.util.Map; + +public class InformationTheoryFunctions { + @Stellar( namespace="IT" + , name="ENTROPY" + , description = "Computes the base-2 entropy of a multiset" + , params = { "input - a multiset (a map of objects to counts)" } + , returns = "The [base-2 entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition) of the count . The unit of this is bits." + ) + public static class Entropy extends BaseStellarFunction { + + @Override + public Object apply(List<Object> args) { + if(args.isEmpty()) { + throw new IllegalArgumentException("IT_ENTROPY expects exactly one argument."); + } + Object inputObj = args.get(0); + if(inputObj == null) { + return null; + } + if(!(inputObj instanceof Map)) { + throw new IllegalArgumentException("IT_ENTROPY expects exactly one argument and expects it to be a map of counts (e.g. Map<?, Integer>)"); + } + Map<?, Integer> countMap = (Map<?, Integer>) inputObj; + return InformationTheoryUtil.INSTANCE.bitEntropy(countMap); + } + } +} http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java ---------------------------------------------------------------------- diff --git a/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java new file mode 100644 index 0000000..94593ec --- /dev/null +++ b/metron-analytics/metron-statistics/src/main/java/org/apache/metron/statistics/informationtheory/InformationTheoryUtil.java @@ -0,0 +1,52 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.metron.statistics.informationtheory; + +import java.util.Map; + +public enum InformationTheoryUtil { + INSTANCE; + private static final double LOG2 = Math.log(2); + + public double entropy(Map<?, Integer> counts, double logOfBase) { + double ret = 0.0; + int n = 0; + if(counts == null || counts.isEmpty()) { + return ret; + } + for(Integer f : counts.values()) { + n+=f; + } + + for(Integer f : counts.values()) { + double p = f.doubleValue()/n; + ret -= p * Math.log(p) / logOfBase; + } + return ret; + } + + public double entropy(Map<?, Integer> counts, int base) { + return entropy(counts, Math.log(base)); + } + + public double bitEntropy(Map<?, Integer> counts) { + return entropy(counts, LOG2); + } +} http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java ---------------------------------------------------------------------- diff --git a/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java b/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java new file mode 100644 index 0000000..a168b2e --- /dev/null +++ b/metron-analytics/metron-statistics/src/test/java/org/apache/metron/statistics/informationtheory/EntropyTest.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.metron.statistics.informationtheory; + +import com.google.common.collect.ImmutableMap; +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; + +import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.run; + +public class EntropyTest { + @Test + public void entropyTest() throws Exception { + //test empty collection + Assert.assertEquals(0.0, (Double) run("IT_ENTROPY({})", new HashMap<>()), 0.0); + + /* + Now consider the string aaaaaaaaaabbbbbccccc or 10 a's followed by 5 b's and 5 c's. + The probabilities of each character is as follows: + p(a) = 1/2 + p(b) = 1/4 + p(c) = 1/4 + so the shannon entropy should be + -p(a)*log_2(p(a)) - p(b)*log_2(p(b)) - p(c)*log_2(p(c)) = + -0.5*-1 - 0.25*-2 - 0.25*-2 = 1.5 + */ + Assert.assertEquals(1.5, (Double) run("IT_ENTROPY({ 'a' : 10, 'b' : 5, 'c' : 5} )", new HashMap<>()), 0.0); + } +} http://git-wip-us.apache.org/repos/asf/metron/blob/e127797e/metron-stellar/stellar-common/README.md ---------------------------------------------------------------------- diff --git a/metron-stellar/stellar-common/README.md b/metron-stellar/stellar-common/README.md index 2ef81e8..4796889 100644 --- a/metron-stellar/stellar-common/README.md +++ b/metron-stellar/stellar-common/README.md @@ -189,6 +189,7 @@ Where: | [ `HLLP_INIT`](../../metron-analytics/metron-statistics#hllp_init) | | [ `HLLP_MERGE`](../../metron-analytics/metron-statistics#hllp_merge) | | [ `IN_SUBNET`](#in_subnet) | +| [ `IT_ENTROPY`](../../metron-analytics/metron-statistics#it_entropy) | | [ `IS_DATE`](#is_date) | | [ `IS_ENCODING`](#is_encoding) | | [ `IS_DOMAIN`](#is_domain) |
