Hi all -

I recently compared ClamAV's extended BM (v 0.84rc1) to BloomAV for
regular signature matching. A tarred version of the slightly modified
libclamav (added matcher-bloom.c) is available at:

http://raconsoft.com/products/opensource/libclamav_bloom.tar.gz

The test file is a 80 MB file merged from Windows executables like
Word, Excel, Fifa, etc.. There are two versions for BloomAV and the
one I tarred is the former (1-byte at a time version). The following
results are from my AMD XP 3200 +.

Regular signatures (28,326) :
Extended Boyer-Moore: 11 MB/s  (1)
BloomAV 1-byte: 89 MB/s  (2)
BloomAV 4-bytes: 122 MB/s  (3)

where performance metrics were taken before and after:

(1)     (ret = cli_bm_scanbuff(buffer, length, virname, root, 0,
ftype, -1))    != CL_VIRUS)
(2-3)   (ret = cli_bloom_filter_scanbuff(buffer, length, virname,
root, ftype)) != CL_VIRUS)

The algorithm first checks the input file against a bloom filter, and
if there is a match, it takes a hash value, and looks up the hash in
an STL map or GNU libavl's red-black tree (the former performs ~10
MB/s better than the latter). False positive rates against the sample
executable for (2) and (3) are 0.25% and 4.3% respectively (this is
how often we hit the STL map). The tree can easily be replaced by a
data structure that performs better (big arrays maybe), I just found
maps to be convenient at a first glance.

BloomAV doesn't handle polymorphic viruses yet, and uses AC for
polymorphic signature matching (as does Boyer-Moore?). It also has a
BETA setting defining the shortest signature it will accept (currently
set to 7).

I'm not sure if clamav-devel@ accepts attachments, but I will try to
attach matcher-bloom.c for interested readers (easier than getting the
tar file). In terms of functionality, I tested BloomAV against 30 test
files I generated from ClamAV's database, and everything worked fine.

What do ClamAV developers think? I have more results if people are
interested. Would this algorithm be worth playing around with?

Thanks,

Ozgun.
#include "clamav.h"
#include "matcher-bloom.h"
#include "rb.h"

// optimum values for CPUs with 256 KB cache sizes
const int BETA = 7;
const int BLOOM_FILTER_SIZE = 4*1024*1024;

bit *ba_new(const elem_t nelems) 
{
  size_t howmany =  NELEM(nelems,(BITS_SZ));
   
  return ((bit *)calloc(howmany, sizeof(bit)));
} 

void ba_assign(bit arr[], elem_t elem, const boolean value) 
{
  if (value) 
    arr[elem / BITS_SZ] |= (1 << (elem % BITS_SZ));
  else 
    arr[elem / BITS_SZ] &= ~(1 << (elem % BITS_SZ));
}

inline
boolean ba_value(const bit arr[], const elem_t elem) 
{
  return( (arr[elem / BITS_SZ] & (1 << (elem % BITS_SZ))) ? TRUE:FALSE );
} 

void init_bloom_filter(bit *to_init)
{
  int i;
  for (i = 0; i < BLOOM_FILTER_SIZE; i++)
    ba_assign(to_init, i, FALSE);
}


// 4 very fast hash functions: Mask (literally inlined), xor, fasthash and
// sdbm. 

// len can NOT be shorter than 2+4 = 6 bytes. This puts a lower limit on how
// small Beta can be. There should be a check in this function to stop it from
// getting called with len < 6, but it's omitted for performance reasons (an
// extra branch instruction is very costly here).
inline
unsigned int xor_hash(unsigned char *buf, size_t len)
{
  unsigned int hash_val = 0;

  hash_val ^= *((unsigned int *) (buf));
  hash_val ^= *((unsigned int *) (buf+1));
  hash_val ^= *((unsigned int *) (buf+2));

  return hash_val;
}

// fast hash from hashlib.c
inline
unsigned long fast_hash(unsigned char *buf, size_t len)
{
  unsigned long hval = 0;
 
  unsigned char *bptr = buf;	/* start of buffer */
  unsigned char *bend = bptr + len;		/* beyond end of buffer */
  
  while (bptr < bend)
    hval = hval * 31UL + *bptr++;
 
  return hval;
}

// sdbm
inline
unsigned long sdbm(unsigned char *buf, size_t len)
{
  unsigned long hval = 0;
  int c;

  unsigned char *bptr = buf;	/* start of buffer */
  unsigned char *bend = bptr + len;		/* beyond end of buffer */
  
  while (bptr < bend)
  {
    c = *bptr++;
    hval = c + (hval << 6) + (hval << 16) - hval;
  }

  return hval;
}

boolean is_possible_match(bit *filter_to_search, unsigned char* string_to_match)
{  
  int mask1024 = 0x3FFFFF;

  if (ba_value(filter_to_search, (fast_hash(string_to_match, BETA) & mask1024))
      == FALSE)
    return FALSE;

  if (ba_value(filter_to_search, (sdbm(string_to_match, BETA) & mask1024))
      == FALSE)
    return FALSE;

  return TRUE;
}

// Inserts entries into the bloom-filter
void set_bloom_filter_value(bit *filter_to_set, unsigned char *string_to_hash)
{
  int mask1024 = 0x3FFFFF;

  ba_assign(filter_to_set,
	    ((*(unsigned int *) string_to_hash) & mask1024), TRUE);
  ba_assign(filter_to_set, (xor_hash(string_to_hash, BETA) & mask1024), TRUE);
  ba_assign(filter_to_set, (fast_hash(string_to_hash, BETA) & mask1024), TRUE);
  ba_assign(filter_to_set, (sdbm(string_to_hash, BETA) & mask1024), TRUE);
}


void insert_into_rb_table(struct rb_table* signature_tree, const char *virname, 
			 const char *hexsig, unsigned short type)
{
  struct cli_bloom_node* bloom_node = 
    (struct cli_bloom_node*) malloc(sizeof(struct cli_bloom_node));
  
  bloom_node->signature_key = (char*) cli_hex2str(hexsig);
  bloom_node->rb_key = sdbm(bloom_node->signature_key, BETA);
  bloom_node->signature_length = strlen(hexsig) / 2;
  bloom_node->signature_name = strdup(virname);
  bloom_node->next = NULL;

  // if item was already in the rb-tree
  void *existing_node = NULL;
  struct cli_bloom_node *str_existing_node = NULL;
  if ((existing_node = rb_find(signature_tree, (void *) (bloom_node))) 
      != NULL)
  {
    str_existing_node = (struct cli_bloom_node*) existing_node;
    str_existing_node->next = bloom_node;
  }
  else
  {
    rb_probe(signature_tree, (void*) (bloom_node));
  }

  // Error checking for memory allocations omitted for now.
  // return 0;
}

// returns 1 if virus is found
// 0 otherwise
int lookup_in_rb_table(const char* rest_of_file, int remaining_size,
		       char **virname, struct rb_table *signature_tree)
{
  int tree_key = sdbm((unsigned char*) rest_of_file, BETA);

  void *existing_node = rb_find(signature_tree, (void *) (&tree_key));
  struct cli_bloom_node *iter_node = NULL;

  if (existing_node)
  {
    iter_node = (struct cli_bloom_node*) existing_node;
    while (iter_node)
    {
      if (remaining_size < iter_node->signature_length)
      {
	iter_node = iter_node->next;
	continue;
      }

      if (!memcmp(rest_of_file, iter_node->signature_key, 
		  iter_node->signature_length))
      {
	(*virname) = iter_node->signature_name;
	return 1;
      }
      iter_node = iter_node->next;
    }

  }

  // else there is a false positive from Bloom filter, gather stats
  // else { }

  return 0;
}

void setup_bloom_structures(bit** bloom_filter, struct rb_table** signature_tree)
{
  // Add Error checking into this function.

  (*bloom_filter) = ba_new(BLOOM_FILTER_SIZE);
  init_bloom_filter(*bloom_filter);

  // NULL allocator defaults to rb_allocator_default.
  (*signature_tree) = rb_create(compare_ints, NULL, NULL);
}

void clean_bloom_structures(bit* bloom_filter, struct rb_table* signature_tree)
{
  free(bloom_filter);

  // Leaks memory for now, walk over the rb tree and deallocate all memory when
  // the program terminates.
  rb_destroy(signature_tree, NULL);
}

int cli_bloom_filter_scanbuff(const char *buffer, unsigned int length, 
			      const char **virname, const struct cl_node *root,
			      unsigned short ftype)
{
  bit* bloom_filter = root->bloom_filter;
  struct rb_table* signature_tree = root->signature_tree;

  // this is where we do the scanning.
  char *file_iterator = (char *) buffer;
  long int file_size_ctr = 0;

  while (file_size_ctr < length)
  {

    // The two fastest hash functions are literally inlined.
    if ((ba_value(bloom_filter, ((*(unsigned int *) file_iterator) &
				 0x3FFFFF)) == TRUE) &&
	(ba_value(bloom_filter,(xor_hash(file_iterator, BETA) &
				0x3FFFFF)) == TRUE))
    {
      if (is_possible_match(bloom_filter, file_iterator) == TRUE)
      {
	if (lookup_in_rb_table((const char*) file_iterator, 
			       length - file_size_ctr, virname, signature_tree))
	{
	  return CL_VIRUS;
	}
      }
    }

    file_size_ctr++;
    file_iterator++;
  }

}
_______________________________________________
http://lurker.clamav.net/list/clamav-devel.html

Reply via email to