Hi all - I recently compared ClamAV's extended BM (v 0.84rc1) to BloomAV for regular signature matching. A tarred version of the slightly modified libclamav (added matcher-bloom.c) is available at:
http://raconsoft.com/products/opensource/libclamav_bloom.tar.gz The test file is a 80 MB file merged from Windows executables like Word, Excel, Fifa, etc.. There are two versions for BloomAV and the one I tarred is the former (1-byte at a time version). The following results are from my AMD XP 3200 +. Regular signatures (28,326) : Extended Boyer-Moore: 11 MB/s (1) BloomAV 1-byte: 89 MB/s (2) BloomAV 4-bytes: 122 MB/s (3) where performance metrics were taken before and after: (1) (ret = cli_bm_scanbuff(buffer, length, virname, root, 0, ftype, -1)) != CL_VIRUS) (2-3) (ret = cli_bloom_filter_scanbuff(buffer, length, virname, root, ftype)) != CL_VIRUS) The algorithm first checks the input file against a bloom filter, and if there is a match, it takes a hash value, and looks up the hash in an STL map or GNU libavl's red-black tree (the former performs ~10 MB/s better than the latter). False positive rates against the sample executable for (2) and (3) are 0.25% and 4.3% respectively (this is how often we hit the STL map). The tree can easily be replaced by a data structure that performs better (big arrays maybe), I just found maps to be convenient at a first glance. BloomAV doesn't handle polymorphic viruses yet, and uses AC for polymorphic signature matching (as does Boyer-Moore?). It also has a BETA setting defining the shortest signature it will accept (currently set to 7). I'm not sure if clamav-devel@ accepts attachments, but I will try to attach matcher-bloom.c for interested readers (easier than getting the tar file). In terms of functionality, I tested BloomAV against 30 test files I generated from ClamAV's database, and everything worked fine. What do ClamAV developers think? I have more results if people are interested. Would this algorithm be worth playing around with? Thanks, Ozgun.
#include "clamav.h" #include "matcher-bloom.h" #include "rb.h" // optimum values for CPUs with 256 KB cache sizes const int BETA = 7; const int BLOOM_FILTER_SIZE = 4*1024*1024; bit *ba_new(const elem_t nelems) { size_t howmany = NELEM(nelems,(BITS_SZ)); return ((bit *)calloc(howmany, sizeof(bit))); } void ba_assign(bit arr[], elem_t elem, const boolean value) { if (value) arr[elem / BITS_SZ] |= (1 << (elem % BITS_SZ)); else arr[elem / BITS_SZ] &= ~(1 << (elem % BITS_SZ)); } inline boolean ba_value(const bit arr[], const elem_t elem) { return( (arr[elem / BITS_SZ] & (1 << (elem % BITS_SZ))) ? TRUE:FALSE ); } void init_bloom_filter(bit *to_init) { int i; for (i = 0; i < BLOOM_FILTER_SIZE; i++) ba_assign(to_init, i, FALSE); } // 4 very fast hash functions: Mask (literally inlined), xor, fasthash and // sdbm. // len can NOT be shorter than 2+4 = 6 bytes. This puts a lower limit on how // small Beta can be. There should be a check in this function to stop it from // getting called with len < 6, but it's omitted for performance reasons (an // extra branch instruction is very costly here). inline unsigned int xor_hash(unsigned char *buf, size_t len) { unsigned int hash_val = 0; hash_val ^= *((unsigned int *) (buf)); hash_val ^= *((unsigned int *) (buf+1)); hash_val ^= *((unsigned int *) (buf+2)); return hash_val; } // fast hash from hashlib.c inline unsigned long fast_hash(unsigned char *buf, size_t len) { unsigned long hval = 0; unsigned char *bptr = buf; /* start of buffer */ unsigned char *bend = bptr + len; /* beyond end of buffer */ while (bptr < bend) hval = hval * 31UL + *bptr++; return hval; } // sdbm inline unsigned long sdbm(unsigned char *buf, size_t len) { unsigned long hval = 0; int c; unsigned char *bptr = buf; /* start of buffer */ unsigned char *bend = bptr + len; /* beyond end of buffer */ while (bptr < bend) { c = *bptr++; hval = c + (hval << 6) + (hval << 16) - hval; } return hval; } boolean is_possible_match(bit *filter_to_search, unsigned char* string_to_match) { int mask1024 = 0x3FFFFF; if (ba_value(filter_to_search, (fast_hash(string_to_match, BETA) & mask1024)) == FALSE) return FALSE; if (ba_value(filter_to_search, (sdbm(string_to_match, BETA) & mask1024)) == FALSE) return FALSE; return TRUE; } // Inserts entries into the bloom-filter void set_bloom_filter_value(bit *filter_to_set, unsigned char *string_to_hash) { int mask1024 = 0x3FFFFF; ba_assign(filter_to_set, ((*(unsigned int *) string_to_hash) & mask1024), TRUE); ba_assign(filter_to_set, (xor_hash(string_to_hash, BETA) & mask1024), TRUE); ba_assign(filter_to_set, (fast_hash(string_to_hash, BETA) & mask1024), TRUE); ba_assign(filter_to_set, (sdbm(string_to_hash, BETA) & mask1024), TRUE); } void insert_into_rb_table(struct rb_table* signature_tree, const char *virname, const char *hexsig, unsigned short type) { struct cli_bloom_node* bloom_node = (struct cli_bloom_node*) malloc(sizeof(struct cli_bloom_node)); bloom_node->signature_key = (char*) cli_hex2str(hexsig); bloom_node->rb_key = sdbm(bloom_node->signature_key, BETA); bloom_node->signature_length = strlen(hexsig) / 2; bloom_node->signature_name = strdup(virname); bloom_node->next = NULL; // if item was already in the rb-tree void *existing_node = NULL; struct cli_bloom_node *str_existing_node = NULL; if ((existing_node = rb_find(signature_tree, (void *) (bloom_node))) != NULL) { str_existing_node = (struct cli_bloom_node*) existing_node; str_existing_node->next = bloom_node; } else { rb_probe(signature_tree, (void*) (bloom_node)); } // Error checking for memory allocations omitted for now. // return 0; } // returns 1 if virus is found // 0 otherwise int lookup_in_rb_table(const char* rest_of_file, int remaining_size, char **virname, struct rb_table *signature_tree) { int tree_key = sdbm((unsigned char*) rest_of_file, BETA); void *existing_node = rb_find(signature_tree, (void *) (&tree_key)); struct cli_bloom_node *iter_node = NULL; if (existing_node) { iter_node = (struct cli_bloom_node*) existing_node; while (iter_node) { if (remaining_size < iter_node->signature_length) { iter_node = iter_node->next; continue; } if (!memcmp(rest_of_file, iter_node->signature_key, iter_node->signature_length)) { (*virname) = iter_node->signature_name; return 1; } iter_node = iter_node->next; } } // else there is a false positive from Bloom filter, gather stats // else { } return 0; } void setup_bloom_structures(bit** bloom_filter, struct rb_table** signature_tree) { // Add Error checking into this function. (*bloom_filter) = ba_new(BLOOM_FILTER_SIZE); init_bloom_filter(*bloom_filter); // NULL allocator defaults to rb_allocator_default. (*signature_tree) = rb_create(compare_ints, NULL, NULL); } void clean_bloom_structures(bit* bloom_filter, struct rb_table* signature_tree) { free(bloom_filter); // Leaks memory for now, walk over the rb tree and deallocate all memory when // the program terminates. rb_destroy(signature_tree, NULL); } int cli_bloom_filter_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_node *root, unsigned short ftype) { bit* bloom_filter = root->bloom_filter; struct rb_table* signature_tree = root->signature_tree; // this is where we do the scanning. char *file_iterator = (char *) buffer; long int file_size_ctr = 0; while (file_size_ctr < length) { // The two fastest hash functions are literally inlined. if ((ba_value(bloom_filter, ((*(unsigned int *) file_iterator) & 0x3FFFFF)) == TRUE) && (ba_value(bloom_filter,(xor_hash(file_iterator, BETA) & 0x3FFFFF)) == TRUE)) { if (is_possible_match(bloom_filter, file_iterator) == TRUE) { if (lookup_in_rb_table((const char*) file_iterator, length - file_size_ctr, virname, signature_tree)) { return CL_VIRUS; } } } file_size_ctr++; file_iterator++; } }
_______________________________________________ http://lurker.clamav.net/list/clamav-devel.html