Ozgun Erdogan
Wed, 20 Jul 2005 23:27:47 -0700
Hi all - I recently compared ClamAV's extended BM (v 0.84rc1) to BloomAV for regular signature matching. A tarred version of the slightly modified libclamav (added matcher-bloom.c) is available at:
http://raconsoft.com/products/opensource/libclamav_bloom.tar.gz The test file is a 80 MB file merged from Windows executables like Word, Excel, Fifa, etc.. There are two versions for BloomAV and the one I tarred is the former (1-byte at a time version). The following results are from my AMD XP 3200 +. Regular signatures (28,326) : Extended Boyer-Moore: 11 MB/s (1) BloomAV 1-byte: 89 MB/s (2) BloomAV 4-bytes: 122 MB/s (3) where performance metrics were taken before and after: (1) (ret = cli_bm_scanbuff(buffer, length, virname, root, 0, ftype, -1)) != CL_VIRUS) (2-3) (ret = cli_bloom_filter_scanbuff(buffer, length, virname, root, ftype)) != CL_VIRUS) The algorithm first checks the input file against a bloom filter, and if there is a match, it takes a hash value, and looks up the hash in an STL map or GNU libavl's red-black tree (the former performs ~10 MB/s better than the latter). False positive rates against the sample executable for (2) and (3) are 0.25% and 4.3% respectively (this is how often we hit the STL map). The tree can easily be replaced by a data structure that performs better (big arrays maybe), I just found maps to be convenient at a first glance. BloomAV doesn't handle polymorphic viruses yet, and uses AC for polymorphic signature matching (as does Boyer-Moore?). It also has a BETA setting defining the shortest signature it will accept (currently set to 7). I'm not sure if clamav-devel@ accepts attachments, but I will try to attach matcher-bloom.c for interested readers (easier than getting the tar file). In terms of functionality, I tested BloomAV against 30 test files I generated from ClamAV's database, and everything worked fine. What do ClamAV developers think? I have more results if people are interested. Would this algorithm be worth playing around with? Thanks, Ozgun.
#include "clamav.h"
#include "matcher-bloom.h"
#include "rb.h"
// optimum values for CPUs with 256 KB cache sizes
const int BETA = 7;
const int BLOOM_FILTER_SIZE = 4*1024*1024;
bit *ba_new(const elem_t nelems)
{
size_t howmany = NELEM(nelems,(BITS_SZ));
return ((bit *)calloc(howmany, sizeof(bit)));
}
void ba_assign(bit arr[], elem_t elem, const boolean value)
{
if (value)
arr[elem / BITS_SZ] |= (1 << (elem % BITS_SZ));
else
arr[elem / BITS_SZ] &= ~(1 << (elem % BITS_SZ));
}
inline
boolean ba_value(const bit arr[], const elem_t elem)
{
return( (arr[elem / BITS_SZ] & (1 << (elem % BITS_SZ))) ? TRUE:FALSE );
}
void init_bloom_filter(bit *to_init)
{
int i;
for (i = 0; i < BLOOM_FILTER_SIZE; i++)
ba_assign(to_init, i, FALSE);
}
// 4 very fast hash functions: Mask (literally inlined), xor, fasthash and
// sdbm.
// len can NOT be shorter than 2+4 = 6 bytes. This puts a lower limit on how
// small Beta can be. There should be a check in this function to stop it from
// getting called with len < 6, but it's omitted for performance reasons (an
// extra branch instruction is very costly here).
inline
unsigned int xor_hash(unsigned char *buf, size_t len)
{
unsigned int hash_val = 0;
hash_val ^= *((unsigned int *) (buf));
hash_val ^= *((unsigned int *) (buf+1));
hash_val ^= *((unsigned int *) (buf+2));
return hash_val;
}
// fast hash from hashlib.c
inline
unsigned long fast_hash(unsigned char *buf, size_t len)
{
unsigned long hval = 0;
unsigned char *bptr = buf; /* start of buffer */
unsigned char *bend = bptr + len; /* beyond end of buffer */
while (bptr < bend)
hval = hval * 31UL + *bptr++;
return hval;
}
// sdbm
inline
unsigned long sdbm(unsigned char *buf, size_t len)
{
unsigned long hval = 0;
int c;
unsigned char *bptr = buf; /* start of buffer */
unsigned char *bend = bptr + len; /* beyond end of buffer */
while (bptr < bend)
{
c = *bptr++;
hval = c + (hval << 6) + (hval << 16) - hval;
}
return hval;
}
boolean is_possible_match(bit *filter_to_search, unsigned char* string_to_match)
{
int mask1024 = 0x3FFFFF;
if (ba_value(filter_to_search, (fast_hash(string_to_match, BETA) & mask1024))
== FALSE)
return FALSE;
if (ba_value(filter_to_search, (sdbm(string_to_match, BETA) & mask1024))
== FALSE)
return FALSE;
return TRUE;
}
// Inserts entries into the bloom-filter
void set_bloom_filter_value(bit *filter_to_set, unsigned char *string_to_hash)
{
int mask1024 = 0x3FFFFF;
ba_assign(filter_to_set,
((*(unsigned int *) string_to_hash) & mask1024), TRUE);
ba_assign(filter_to_set, (xor_hash(string_to_hash, BETA) & mask1024), TRUE);
ba_assign(filter_to_set, (fast_hash(string_to_hash, BETA) & mask1024), TRUE);
ba_assign(filter_to_set, (sdbm(string_to_hash, BETA) & mask1024), TRUE);
}
void insert_into_rb_table(struct rb_table* signature_tree, const char *virname,
const char *hexsig, unsigned short type)
{
struct cli_bloom_node* bloom_node =
(struct cli_bloom_node*) malloc(sizeof(struct cli_bloom_node));
bloom_node->signature_key = (char*) cli_hex2str(hexsig);
bloom_node->rb_key = sdbm(bloom_node->signature_key, BETA);
bloom_node->signature_length = strlen(hexsig) / 2;
bloom_node->signature_name = strdup(virname);
bloom_node->next = NULL;
// if item was already in the rb-tree
void *existing_node = NULL;
struct cli_bloom_node *str_existing_node = NULL;
if ((existing_node = rb_find(signature_tree, (void *) (bloom_node)))
!= NULL)
{
str_existing_node = (struct cli_bloom_node*) existing_node;
str_existing_node->next = bloom_node;
}
else
{
rb_probe(signature_tree, (void*) (bloom_node));
}
// Error checking for memory allocations omitted for now.
// return 0;
}
// returns 1 if virus is found
// 0 otherwise
int lookup_in_rb_table(const char* rest_of_file, int remaining_size,
char **virname, struct rb_table *signature_tree)
{
int tree_key = sdbm((unsigned char*) rest_of_file, BETA);
void *existing_node = rb_find(signature_tree, (void *) (&tree_key));
struct cli_bloom_node *iter_node = NULL;
if (existing_node)
{
iter_node = (struct cli_bloom_node*) existing_node;
while (iter_node)
{
if (remaining_size < iter_node->signature_length)
{
iter_node = iter_node->next;
continue;
}
if (!memcmp(rest_of_file, iter_node->signature_key,
iter_node->signature_length))
{
(*virname) = iter_node->signature_name;
return 1;
}
iter_node = iter_node->next;
}
}
// else there is a false positive from Bloom filter, gather stats
// else { }
return 0;
}
void setup_bloom_structures(bit** bloom_filter, struct rb_table** signature_tree)
{
// Add Error checking into this function.
(*bloom_filter) = ba_new(BLOOM_FILTER_SIZE);
init_bloom_filter(*bloom_filter);
// NULL allocator defaults to rb_allocator_default.
(*signature_tree) = rb_create(compare_ints, NULL, NULL);
}
void clean_bloom_structures(bit* bloom_filter, struct rb_table* signature_tree)
{
free(bloom_filter);
// Leaks memory for now, walk over the rb tree and deallocate all memory when
// the program terminates.
rb_destroy(signature_tree, NULL);
}
int cli_bloom_filter_scanbuff(const char *buffer, unsigned int length,
const char **virname, const struct cl_node *root,
unsigned short ftype)
{
bit* bloom_filter = root->bloom_filter;
struct rb_table* signature_tree = root->signature_tree;
// this is where we do the scanning.
char *file_iterator = (char *) buffer;
long int file_size_ctr = 0;
while (file_size_ctr < length)
{
// The two fastest hash functions are literally inlined.
if ((ba_value(bloom_filter, ((*(unsigned int *) file_iterator) &
0x3FFFFF)) == TRUE) &&
(ba_value(bloom_filter,(xor_hash(file_iterator, BETA) &
0x3FFFFF)) == TRUE))
{
if (is_possible_match(bloom_filter, file_iterator) == TRUE)
{
if (lookup_in_rb_table((const char*) file_iterator,
length - file_size_ctr, virname, signature_tree))
{
return CL_VIRUS;
}
}
}
file_size_ctr++;
file_iterator++;
}
}
_______________________________________________ http://lurker.clamav.net/list/clamav-devel.html