-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 GiM wrote: > Gianluigi Tiesi in message '[Clamav-devel] Bloom Hash AV Matcher' wrote: >> the logic of the overall scan is: >> bloom first, if positive bm, >> then ac as normal flow >> >> the patch has not yet included gpl header, but the guy gave me the permission >> > > so, where's the patch? > > how much additional memory does it take? > sorry this list seems to remove non txt attachments
I don't known about memory requirements, I don't think it will affect too much I've attached the patch - -- Gianluigi Tiesi <[EMAIL PROTECTED]> EDP Project Leader Netfarm S.r.l. - http://www.netfarm.it/ Free Software: http://oss.netfarm.it/ -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.1 (MingW32) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org iD8DBQFGXoDM3UE5cRfnO04RAr3hAKCjfk5eD+xK4XbUG7vIHlk5RGfc8ACeODzM 5fwerVRNXQMgz8GkXuXdgl4= =CiHB -----END PGP SIGNATURE-----
diff -Nur orig/libclamav/Makefile.am clamav-devel/libclamav/Makefile.am --- orig/libclamav/Makefile.am 2005-12-16 07:11:00.000000000 +0100 +++ clamav-devel/libclamav/Makefile.am 2006-01-10 07:05:29.000000000 +0100 @@ -30,6 +30,8 @@ matcher-ac.h \ matcher-bm.c \ matcher-bm.h \ + matcher-bloom.c \ + matcher-bloom.h \ matcher.c \ matcher.h \ md5.c \ diff -Nur orig/libclamav/clamav.h clamav-devel/libclamav/clamav.h --- orig/libclamav/clamav.h 2005-12-13 08:22:04.000000000 +0100 +++ clamav-devel/libclamav/clamav.h 2006-01-10 07:04:58.000000000 +0100 @@ -133,6 +133,15 @@ struct cli_meta_node *next; }; +/* Additions for Bloom-AV */ +typedef unsigned char bit; + +struct cli_bloom_node { + char *signature_key, *signature_name; + unsigned int signature_length; + struct cli_bloom_node *next; +}; + struct cli_matcher { unsigned int maxpatlen; /* maximal length of pattern in db */ unsigned short ac_only; @@ -145,6 +154,10 @@ unsigned int ac_depth; struct cli_ac_node *ac_root, **ac_nodetable; unsigned int ac_partsigs, ac_nodes; + + /* BloomAV additions */ + bit* bloom_filter; + struct cli_bloom_node** sig_hashtable; }; struct cl_engine { diff -Nur orig/libclamav/matcher-bloom.c clamav-devel/libclamav/matcher-bloom.c --- orig/libclamav/matcher-bloom.c 1970-01-01 01:00:00.000000000 +0100 +++ clamav-devel/libclamav/matcher-bloom.c 2006-01-10 07:04:58.000000000 +0100 @@ -0,0 +1,246 @@ +#if HAVE_CONFIG_H +#include "clamav-config.h" +#endif + +#include "clamav.h" +#include "matcher-bloom.h" + +/* optimum values for CPUs with 256 KB cache sizes */ +const int BETA = 7; +const int BLOOM_FILTER_SIZE = 4*1024*1024; + +/* 1.5 * 4 =~ 6 MB memory overhead */ +const int HASHTABLE_PRIME = 1572869; + +bit *ba_new(const elem_t nelems) +{ + size_t howmany = NELEM(nelems,(BITS_SZ)); + return ((bit *) cli_calloc(howmany, sizeof(bit))); +} + +void ba_assign(bit arr[], elem_t elem, const bool value) +{ + if (value) + arr[elem / BITS_SZ] |= (1 << (elem % BITS_SZ)); + else + arr[elem / BITS_SZ] &= ~(1 << (elem % BITS_SZ)); +} + +inline bool ba_value(const bit arr[], const elem_t elem) +{ + return ((arr[elem / BITS_SZ] & (1 << (elem % BITS_SZ))) ? TRUE : FALSE); +} + +void init_bloom_filter(bit *to_init) +{ + int i; + for (i = 0; i < BLOOM_FILTER_SIZE; i++) + ba_assign(to_init, i, FALSE); +} + + +/* 4 very fast hash functions: Mask (literally inlined), xor, fasthash and + * sdbm. + * + * len can NOT be shorter than 2+4 = 6 bytes. This puts a lower limit on how + * small Beta can be. There should be a check in this function to stop it from + * getting called with len < 6, but it's omitted for performance reasons (an + * extra branch instruction is very costly here). + */ + +inline unsigned int xor_hash(unsigned char *buf, size_t len) +{ + unsigned int hash_val = 0; + hash_val ^= *((unsigned int *) (buf)); + hash_val ^= *((unsigned int *) (buf+1)); + hash_val ^= *((unsigned int *) (buf+2)); + return hash_val; +} + +/* fast hash from hashlib.c */ +inline unsigned long fast_hash(unsigned char *buf, size_t len) +{ + unsigned long hval = 0; + + unsigned char *bptr = buf; /* start of buffer */ + unsigned char *bend = bptr + len; /* beyond end of buffer */ + + while (bptr < bend) + hval = hval * 31UL + *bptr++; + + return hval; +} + +/* sdbm */ +inline unsigned long sdbm(unsigned char *buf, size_t len) +{ + unsigned long hval = 0; + int c; + + unsigned char *bptr = buf; /* start of buffer */ + unsigned char *bend = bptr + len; /* beyond end of buffer */ + + while (bptr < bend) + { + c = *bptr++; + hval = c + (hval << 6) + (hval << 16) - hval; + } + + return hval; +} + +bool is_possible_match(bit *filter_to_search, unsigned char* string_to_match) +{ + int mask1024 = 0x3FFFFF; + + if (ba_value(filter_to_search, (fast_hash(string_to_match, BETA) & mask1024)) == FALSE) + return FALSE; + + if (ba_value(filter_to_search, (sdbm(string_to_match, BETA) & mask1024)) == FALSE) + return FALSE; + + return TRUE; +} + +/* Inserts entries into the bloom-filter */ +void set_bloom_filter_value(bit *filter_to_set, unsigned char *string_to_hash) +{ + int mask1024 = 0x3FFFFF; + + ba_assign(filter_to_set, ((*(unsigned int *) string_to_hash) & mask1024), TRUE); + ba_assign(filter_to_set, (xor_hash(string_to_hash, BETA) & mask1024), TRUE); + ba_assign(filter_to_set, (fast_hash(string_to_hash, BETA) & mask1024), TRUE); + ba_assign(filter_to_set, (sdbm(string_to_hash, BETA) & mask1024), TRUE); +} + +int insert_into_hash_table(struct cli_bloom_node* hash_table[], const char *virname, const char *hexsig, unsigned short type) +{ + struct cli_bloom_node* bloom_node = NULL; + int hash_key = 0; + + if ((strlen(hexsig) / 2) < BETA) + { + cli_dbgmsg("BloomAV can't scan for very short virus sigs! Skipping."); + return 1; + } + + bloom_node = (struct cli_bloom_node *) cli_malloc(sizeof(struct cli_bloom_node)); + + bloom_node->signature_key = (char *) cli_hex2str(hexsig); + bloom_node->signature_length = strlen(hexsig) / 2; + bloom_node->signature_name = strdup(virname); + bloom_node->next = NULL; + + hash_key = sdbm(bloom_node->signature_key, BETA) % HASHTABLE_PRIME; + + /* if item was already in the hash-table, do a list head insertion */ + if (hash_table[hash_key]) + { + struct cli_bloom_node* second_node = hash_table[hash_key]->next; + hash_table[hash_key]->next = bloom_node; + bloom_node->next = second_node; + } + else + hash_table[hash_key] = bloom_node; + + /* Error checking for memory allocations omitted for now. */ + return 0; +} + + +/* returns 1 if virus is found + * 0 otherwise + */ +int lookup_in_hash_table(const char* rest_of_file, int remaining_size, const char **virname, struct cli_bloom_node* hash_table[]) +{ + int hash_key = sdbm((unsigned char*) rest_of_file, BETA) % HASHTABLE_PRIME; + struct cli_bloom_node* table_iter = hash_table[hash_key]; + + while (table_iter) + { + if (remaining_size < table_iter->signature_length) + { + table_iter = table_iter->next; + continue; + } + + if (!memcmp(rest_of_file, table_iter->signature_key, table_iter->signature_length)) + { + *virname = (char *) table_iter->signature_name; + return 1; + } + + table_iter = table_iter->next; + } + + /* else there is a false positive from Bloom filter, gather stats */ + return 0; +} + +int cli_bloom_init(struct cli_matcher *root) +{ + /* Add Error checking into this function. */ + root->bloom_filter = ba_new(BLOOM_FILTER_SIZE); + init_bloom_filter(root->bloom_filter); + root->sig_hashtable = (struct cli_bloom_node **) cli_calloc(HASHTABLE_PRIME, sizeof(struct regular_signature *)); + return 0; +} + +void clean_bloom_structures(bit* bloom_filter, struct cli_bloom_node* hash_table[]) +{ + int i = 0; + free(bloom_filter); + + for (i = 0; i < HASHTABLE_PRIME; i++) + { + if (hash_table[i]) + { + struct cli_bloom_node* list_head = hash_table[i]; + while (list_head) + { + struct cli_bloom_node* sig_to_delete = list_head; + list_head = list_head->next; + free(sig_to_delete->signature_key); + free(sig_to_delete->signature_name); + free(sig_to_delete); + } + } + } + + free(hash_table); +} + +int cli_bloom_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, unsigned long int offset, unsigned short ftype) +{ + bit* bloom_filter = root->bloom_filter; + struct cli_bloom_node** sig_hashtable = root->sig_hashtable; + + /* this is where we do the scanning. */ + char *file_iterator = (char *) buffer; + long int file_size_ctr = 0; + + if (offset != 0) return CL_VIRUS; /* not yet supported pass to bm as false positive */ + while (file_size_ctr < length) + { + + /* The fastest hash function is literally inlined. */ + if ((ba_value(bloom_filter, ((*(unsigned int *) file_iterator) & 0x3FFFFF)) == TRUE) && + (ba_value(bloom_filter, (xor_hash(file_iterator, BETA) & 0x3FFFFF)) == TRUE)) + { + if (is_possible_match(bloom_filter, file_iterator) == TRUE) + { + if (lookup_in_hash_table((const char*) file_iterator, length - file_size_ctr, virname, sig_hashtable)) + { + /* cli_errmsg("Bloom filter found %s\n", *virname); */ + return CL_VIRUS; + } + } + } + + file_size_ctr++; + file_iterator++; + } + + /* cli_dbgmsg("Bloom filter says CLEAN\n"); */ + return 0; +} diff -Nur orig/libclamav/matcher-bloom.h clamav-devel/libclamav/matcher-bloom.h --- orig/libclamav/matcher-bloom.h 1970-01-01 01:00:00.000000000 +0100 +++ clamav-devel/libclamav/matcher-bloom.h 2006-01-10 07:04:58.000000000 +0100 @@ -0,0 +1,52 @@ +#ifndef __MATCHER_BLOOM_H +#define __MATCHER_BLOOM_H + +#if HAVE_CONFIG_H +#include "clamav-config.h" +#endif + +#define NELEM(N,ELEMPER) ((N + (ELEMPER) - 1) / (ELEMPER)) +#define BITS_SZ 8 + +#ifndef __cplusplus +/* required for AIX and Tru64 */ +#ifdef TRUE +#undef TRUE +#endif +#ifdef FALSE +#undef FALSE +#endif + +typedef enum { FALSE = 0, TRUE = 1 } bool; +#endif + +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <unistd.h> +#include <fcntl.h> +#include <time.h> +#include <stdlib.h> +#include <stddef.h> + +#include <clamav.h> + +typedef size_t elem_t; + +bit *ba_new(const elem_t nelems); +void ba_assign(bit arr[], elem_t elem, const bool value); + +/* functions for our bloom filter */ +void init_bloom_filter(bit *to_init); +void set_bloom_filter_value(bit *filter_to_set,unsigned char *string_to_hash); +bool is_possible_match(bit *filter_to_search, unsigned char* string_to_match); + +int cli_bloom_init(struct cli_matcher *root); +int insert_into_hash_table(struct cli_bloom_node* hash_table[], const char *virname, const char *hexsig, unsigned short type); +int cli_bloom_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, unsigned long int offset, unsigned short ftype); + +#endif /* __MATCHER_BLOOM_H */ diff -Nur orig/libclamav/matcher.c clamav-devel/libclamav/matcher.c --- orig/libclamav/matcher.c 2006-01-07 04:27:08.000000000 +0100 +++ clamav-devel/libclamav/matcher.c 2006-01-10 07:04:58.000000000 +0100 @@ -30,6 +30,7 @@ #include "others.h" #include "matcher-ac.h" #include "matcher-bm.h" +#include "matcher-bloom.h" #include "md5.h" #include "filetypes.h" #include "matcher.h" @@ -88,8 +89,10 @@ return CL_EMEM; } - if(troot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, troot, 0, ftype, -1)) != CL_VIRUS) - ret = cli_ac_scanbuff(buffer, length, virname, troot, partcnt, 0, 0, partoff, ftype, -1, NULL); + if(troot->ac_only || + (((ret = cli_bloom_scanbuff(buffer, length, virname, troot, 0, ftype)) != CL_VIRUS) || + ((ret = cli_bm_scanbuff(buffer, length, virname, troot, 0, ftype, -1)) != CL_VIRUS))) + ret = cli_ac_scanbuff(buffer, length, virname, troot, partcnt, 0, 0, partoff, ftype, -1, NULL); free(partcnt); free(partoff); @@ -109,8 +112,10 @@ return CL_EMEM; } - if(groot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, groot, 0, ftype, -1)) != CL_VIRUS) - ret = cli_ac_scanbuff(buffer, length, virname, groot, partcnt, 0, 0, partoff, ftype, -1, NULL); + if(troot->ac_only || + (((ret = cli_bloom_scanbuff(buffer, length, virname, troot, 0, ftype)) != CL_VIRUS) || + ((ret = cli_bm_scanbuff(buffer, length, virname, groot, 0, ftype, -1)) != CL_VIRUS))) + ret = cli_ac_scanbuff(buffer, length, virname, groot, partcnt, 0, 0, partoff, ftype, -1, NULL); free(partcnt); free(partoff); @@ -371,8 +376,10 @@ length -= SCANBUFF - bytes; if(troot) { - if(troot->ac_only || (ret = cli_bm_scanbuff(pt, length, virname, troot, offset, ftype, desc)) != CL_VIRUS) - ret = cli_ac_scanbuff(pt, length, virname, troot, tpartcnt, otfrec, offset, tpartoff, ftype, desc, ftoffset); + if(troot->ac_only || + (((ret = cli_bloom_scanbuff(pt, length, virname, troot, offset, ftype)) != CL_VIRUS) || + ((ret = cli_bm_scanbuff(pt, length, virname, troot, offset, ftype, desc)) != CL_VIRUS))) + ret = cli_ac_scanbuff(pt, length, virname, troot, tpartcnt, otfrec, offset, tpartoff, ftype, desc, ftoffset); if(ret == CL_VIRUS) { free(buffer); @@ -389,8 +396,10 @@ } } - if(groot->ac_only || (ret = cli_bm_scanbuff(pt, length, virname, groot, offset, ftype, desc)) != CL_VIRUS) - ret = cli_ac_scanbuff(pt, length, virname, groot, gpartcnt, otfrec, offset, gpartoff, ftype, desc, ftoffset); + if(groot->ac_only || + (((ret = cli_bloom_scanbuff(pt, length, virname, groot, offset, ftype)) != CL_VIRUS) || + ((ret = cli_bm_scanbuff(pt, length, virname, groot, offset, ftype, desc)) != CL_VIRUS))) + ret = cli_ac_scanbuff(pt, length, virname, groot, gpartcnt, otfrec, offset, gpartoff, ftype, desc, ftoffset); if(ret == CL_VIRUS) { free(buffer); diff -Nur orig/libclamav/readdb.c clamav-devel/libclamav/readdb.c --- orig/libclamav/readdb.c 2005-12-13 08:22:04.000000000 +0100 +++ clamav-devel/libclamav/readdb.c 2006-01-10 07:04:58.000000000 +0100 @@ -36,6 +36,7 @@ #include "strings.h" #include "matcher-ac.h" #include "matcher-bm.h" +#include "matcher-bloom.h" #include "others.h" #include "str.h" #include "defaults.h" @@ -60,6 +61,28 @@ /* TODO: clean up the code */ +/** + * Helper function for BloomAV + */ +int cli_bloom_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, unsigned short type) +{ + char *binary_sig = NULL; + + if (strpbrk(hexsig, "?(*{")) + { + cli_errmsg("BloomAV won't scan poly signatures! Skipping."); + return 0; + } + + binary_sig = cli_hex2str(hexsig); + if (!binary_sig) return CL_EMEM; + + set_bloom_filter_value(root->bloom_filter, binary_sig); + insert_into_hash_table(root->sig_hashtable, virname, hexsig, type); + free(binary_sig); + return 0; +} + static int cli_ac_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, int sigid, int parts, int partno, unsigned short type, unsigned int mindist, unsigned int maxdist, char *offset, unsigned short target) { struct cli_ac_patt *new; @@ -379,7 +402,15 @@ return ret; } - } else { + } + /* While adding signatures to BM, also add to BloomAV */ + else + { + if (ret = cli_bloom_addsig(root, virname, hexsig, 0)) + { + cli_errmsg("cli_parse_add(): Problem adding signature (3).\n"); + return ret; + } bm_new = (struct cli_bm_patt *) calloc(1, sizeof(struct cli_bm_patt)); if(!bm_new) return CL_EMEM; @@ -490,10 +521,19 @@ cli_errmsg("Can't initialise BM pattern matcher\n"); return ret; } - } - } - } + /* Adding initialization code for BloomAV */ + /* Yup, we should check separately for bloom filter and hash-table */ + if (!root->bloom_filter) { + cli_dbgmsg("Initializing Bloom filter and signature table of root[%d]\n", i); + if((ret = cli_bloom_init(root))) { + cli_errmsg("Can't initialise Bloom pattern matcher\n"); + return ret; + } + } + } + } + } return 0; } @@ -575,7 +615,14 @@ cl_free(*engine); return ret; } - +#if 0 + /* Adding initialization code for BloomAV */ + /* Yup, we should check separately for bloom filter and rb-tree */ + if (!(*(*engine)->root)->bloom_filter) { + cli_dbgmsg("Initializing Bloom filter and signature table\n"); + cli_bloom_init((*(*engine)->root)); + } +#endif while(fgets(buffer, FILEBUFF, fd)) { line++;
_______________________________________________ http://lurker.clamav.net/list/clamav-devel.html Please submit your patches to our Bugzilla: http://bugs.clamav.net