Re: [PR] 3166 anomaly detection isolation forest [systemds]

via GitHub Fri, 12 Jan 2024 10:28:57 -0800


sigmaeon commented on code in PR #1980:
URL: https://github.com/apache/systemds/pull/1980#discussion_r1450793640



##########
scripts/staging/isolationForest/test/isolationForestTest.dml:
##########
@@ -0,0 +1,707 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+source("./scripts/staging/isolationForest/isolationForest.dml") as iForest;
+
+# This script tests the isolationForest implementation in isolationForest.dml.
+# In particular functions `outlierByIsolationForest` and 
`outlierByIsolationForestApply`
+# as well as sub-routines are tested here.
+# 
---------------------------------------------------------------------------------------------
+
+
+# U N I T   T E S T S
+# 
---------------------------------------------------------------------------------------------
+# 
---------------------------------------------------------------------------------------------
+
+# Utility function for printing test results
+#
+# Prints whether the given test succeeded and updates the running bookkeeping.
+#
+# INPUT:
+#   testname  name of the test, used in the printed message
+#   success   TRUE if the test passed, FALSE otherwise
+#   t_cnt     number of tests recorded so far
+#   fails     names of the tests that have failed so far
+# OUTPUT:
+#   t_cnt     the input counter incremented by one
+#   fails     the input list, with testname appended if the test failed
+record_test_result = function(String testname, Boolean success, Int t_cnt, 
List[String] fails) 
+  return(Boolean consistent)
+{
+  t_cnt = t_cnt + 1
+
+  if (success) {
+    print("- Test '"+testname+"' was successful!")
+    # NOTE(review): this self-assignment looks redundant; presumably it makes
+    # sure the output `fails` is assigned in both branches - confirm before
+    # removing it.
+    fails = fails
+  }
+  else {
+    print("- Test '"+testname+"' failed!")
+    fails = append(fails, testname)
+  }
+}
+
+# Checks whether two matrices have identical dimensions and element-wise equal
+# values up to an absolute tolerance of 1e-14.
+#
+# INPUT:
+#   m1, m2  matrices to compare
+# OUTPUT:
+#   equal   TRUE if both matrices have the same shape and no pair of
+#           corresponding entries differs by more than 1e-14, FALSE otherwise
+matrices_equal = function(Matrix[Double] m1, Matrix[Double] m2) 
+  return(Boolean equal)
+{
+  if (ncol(m1) == ncol(m2) & nrow(m1) == nrow(m2)) {
+    # Compare the absolute difference: with the signed difference, entries
+    # where m2 is larger than m1 would never exceed the tolerance and the
+    # matrices would wrongly be reported as equal.
+    inequality_mat = abs(m1 - m2) > 1e-14
+    equal = sum(inequality_mat) == 0
+  }
+  else
+    equal = FALSE
+}
+
+# Checks the structural consistency of a single isolation tree model.
+#
+# As read by this function, the model M is a single row holding two entries
+# per node, with nodes stored level by level so that the parent of node `id`
+# is node floor(id / 2). The first entry of a node determines its kind:
+#   > 0   internal node: entry 1 is the split feature index, entry 2 the
+#         split value
+#   == 0  external node: entry 2 is the node size
+#   == -1 placeholder (empty) node: entry 2 has to be -1 as well
+#
+# INPUT:
+#   M                    the tree model (only row 1 is read)
+#   X                    the data the split features and values are checked
+#                        against
+#   max_depth            upper bound for the depth of the tree
+#   is_subsampled_model  if TRUE, the check that external node sizes sum up
+#                        to nrow(X) is skipped
+# OUTPUT:
+#   consistent           TRUE if none of the checks below failed
+is_itree_consistent = function(Matrix[Double] M, Matrix[Double] X, Int 
max_depth, Boolean is_subsampled_model = FALSE) 
+  return(Boolean consistent)
+{
+  consistent = TRUE
+  # two entries per node; depth is counted with the root at depth 0
+  n_nodes = length(M) / 2
+  tree_depth = floor(log(n_nodes + 1, 2)) - 1
+
+  # check if the model corresponds to a full binary tree of depth tree_depth
+  check_full_tree = n_nodes > 1 & tree_depth == floor(log(n_nodes, 2)) & 
tree_depth < floor(log(n_nodes + 2, 2))
+  if (!check_full_tree) print("Inconsistency: Model is no full binary tree!")
+  consistent = consistent & check_full_tree
+
+  # check tree depth
+  check_max_depth = tree_depth <= max_depth
+  if (!check_max_depth) print("Inconsistency: Tree depth exeeds max_depth!")
+  consistent = consistent & check_max_depth
+  
+  # root node has to be a valid internal node
+  # NOTE(review): X is indexed with the root split feature inside the same
+  # expression that validates it; presumably an invalid index raises an error
+  # here instead of yielding FALSE - confirm.
+  root_node_split_feature = as.integer(as.scalar(M[1, 1]))
+  root_node_split_value = as.scalar(M[1, 2])
+  check_first_node = root_node_split_feature > 0 & root_node_split_feature <= 
ncol(X) & 
+    min(X[,root_node_split_feature]) <= root_node_split_value & 
max(X[,root_node_split_feature]) >= root_node_split_value
+  if (!check_first_node) print("Inconsistency: Root node is not a valid 
internal node!")
+  consistent = consistent & check_first_node
+
+  # visit every node after the root (the root occupies columns 1 and 2)
+  sum_external_node_sizes = 0
+  for (node_start_idx in seq(3, length(M), 2)) {
+    node_entry_1 = as.integer(as.scalar(M[1, node_start_idx]))
+    node_entry_2 = as.double(as.scalar(M[1, node_start_idx + 1]))
+    # 1-based node id derived from the column index of the node's first entry
+    node_id = (node_start_idx + 1) / 2
+    node_depth = floor(log(node_id, 2))
+    parent_node_id = floor(node_id / 2)
+    # first entry of the parent node, used to determine the parent's kind
+    parent_node_entry_1 = as.integer(as.scalar(M[1, (parent_node_id * 2)-1]))
+
+    if (node_entry_1 > 0) {
+      # internal node
+      if (node_depth == tree_depth) {
+        print("Inconsistency: Node in last level is not an external node!")
+        consistent = FALSE
+      }
+
+      check_split_feature_exists = node_entry_1 <= ncol(X)
+      if (!check_split_feature_exists) print("Inconsistency: Split-Feature 
index "+node_entry_1+" exceeds number of features!")
+
+      consistent = consistent & check_split_feature_exists
+
+      # NOTE(review): the column is read even if the check above failed;
+      # presumably this errors for an out-of-range feature index - confirm.
+      feature = X[,node_entry_1]
+      check_value_in_range = min(feature) <= node_entry_2 & max(feature) >= 
node_entry_2
+      if (!check_value_in_range) print("Inconsistency: Split-Value " + 
node_entry_2 + " is not in range of the feature "+node_entry_1+"!")
+      consistent = consistent & check_value_in_range
+
+      check_parent_node = parent_node_entry_1 > 0
+      if (!check_parent_node) print("The parent of an internal node has to be 
an internal node!")
+      consistent = consistent & check_parent_node
+    }
+    else if (node_entry_1 == 0) {
+      # external node
+      # the second entry of an external node is its size; accumulate it for
+      # the row-count check after the loop
+      sum_external_node_sizes = as.integer(sum_external_node_sizes + 
node_entry_2)
+
+      check_parent_node = parent_node_entry_1 > 0
+      if (!check_parent_node) print("The parent of an external node has to be 
an internal node!")
+      consistent = consistent & check_parent_node
+    }
+    else if (node_entry_1 == -1) {
+      # placeholder node (empty node entry)
+      check_empty_node = node_entry_2 == -1
+      if (!check_empty_node) print("A non-node can only have -1 as entries!")
+      consistent = consistent & check_empty_node
+
+      check_parent_node = parent_node_entry_1 <= 0
+      if (!check_parent_node) print("The parent of a non-node can only be 
another non-node or an external!")
+      consistent = consistent & check_parent_node
+    }
+    else {
+      # any first entry below -1 is not a valid node encoding
+      print("Inconsistency: First node-entry invalid!")
+      consistent = FALSE
+    }
+
+  }
+
+  # The summed sizes of leaf nodes needs to be the original number of rows
+  # This does not hold for subsampled models!
+  if (!is_subsampled_model) {
+    check_sum_externals = sum_external_node_sizes == nrow(X)
+    if (!check_sum_externals) print("Sizes in external notes do not sum to the 
number of rows in X!")
+    consistent = consistent & check_sum_externals
+  }
+}
+
+# Checks the consistency of an isolation forest model.
+#
+# Every row of M is treated as one isolation tree. Each tree has to have the
+# expected number of columns and has to pass is_itree_consistent, which is
+# called with the height limit as max_depth and is_subsampled_model = TRUE.
+#
+# INPUT:
+#   M                 the forest model, one tree per row
+#   X                 the data passed on to is_itree_consistent
+#   subsampling_size  subsampling size from which the height limit and the
+#                     expected tree size are derived
+# OUTPUT:
+#   consistent        TRUE if all trees passed both checks
+is_iforest_consistent = function(Matrix[Double] M, Matrix[Double] X, Int 
subsampling_size) 
+  return(Boolean consistent)
+{
+  consistent = TRUE
+
+  # A full binary tree of depth height_limit has 2^(height_limit+1)-1 nodes;
+  # the factor 2 accounts for the two entries stored per node.
+  height_limit = ceil(log(subsampling_size, 2))
+  tree_size = 2*(2^(height_limit+1)-1)
+  for (tree_id in 1:nrow(M)) {
+    M_tree = M[tree_id,]
+    check_tree_size = ncol(M_tree) == tree_size
+    if (!check_tree_size) print("iTree in iForest is does not have the 
expected size!")
+    consistent = consistent & check_tree_size
+
+    check_tree_consistent = is_itree_consistent(M_tree, X, height_limit, TRUE)
+    if (!check_tree_consistent) print("iTree at index "+tree_id+" in iForest 
is inconsistent!")
+    consistent = consistent & check_tree_consistent
+  }
+}
+
+# We need to initialize test_counter using a multiple return statement, 
otherwise we have scoping problems!
+# TODO: This is most likely a bug in which case an issue should be created for 
it

Review Comment:
   Minimal reproducible example:
   ```R
   # Returns i_old incremented by one and l_old with the string form of the
   # new counter appended. Both branches of the if/else perform the same
   # append, so the result does not depend on the condition.
   add_and_return = function(Int i_old, List[String] l_old) 
     return(Int i, List[String] l)
   {
     i = i_old + 1
     if (TRUE)
       l = append(l_old, toString(i))
     else
       l = append(l_old, toString(i))
   }
   
   # Works!
   # The counter and the list are initialized through a multi-return function
   # call before being passed to add_and_return three times.
   init = function() return(Int i, List[String] l) {i=0; l=list();}
   [test_i, test_l] = init()
   
   [test_i, test_l] = add_and_return(test_i, test_l)
   [test_i, test_l] = add_and_return(test_i, test_l)
   [test_i, test_l] = add_and_return(test_i, test_l)
   
   print("Expected") 
   print("------------")
   print("test_i: "+test_i)
   print("test_l: "+toString(test_l))
   
   # Does not work!
   # Same three calls, but the counter and the list are initialized by plain
   # assignments instead of a multi-return function call.
   test_fail_i = 0
   test_fail_l = list()
   [test_fail_i, test_fail_l] = add_and_return(test_fail_i, test_fail_l)
   [test_fail_i, test_fail_l] = add_and_return(test_fail_i, test_fail_l)
   [test_fail_i, test_fail_l] = add_and_return(test_fail_i, test_fail_l)
   
   print("\nFail") 
   print("------------")
   print("test_i: "+test_fail_i)
   print("test_l: "+toString(test_fail_l))
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@systemds.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] 3166 anomaly detection isolation forest [systemds]

Reply via email to