-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

GiM wrote:
> Gianluigi Tiesi in message '[Clamav-devel] Bloom Hash AV Matcher' wrote:
>> the logic of the overall scan is:
>> bloom first, if positive bm,
>> then ac as normal flow
>>
>> the patch has not yet included gpl header, but the guy gave me the permission
>>
> 
> so, where's the patch?
> 
> how much additional memory does it take?
> 
sorry this list seems to remove non txt attachments

I don't known about memory requirements, I don't think it will
affect too much

I've attached the patch

- --
Gianluigi Tiesi <[EMAIL PROTECTED]>
EDP Project Leader
Netfarm S.r.l. - http://www.netfarm.it/
Free Software: http://oss.netfarm.it/

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.1 (MingW32)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQFGXoDM3UE5cRfnO04RAr3hAKCjfk5eD+xK4XbUG7vIHlk5RGfc8ACeODzM
5fwerVRNXQMgz8GkXuXdgl4=
=CiHB
-----END PGP SIGNATURE-----
diff -Nur orig/libclamav/Makefile.am clamav-devel/libclamav/Makefile.am
--- orig/libclamav/Makefile.am  2005-12-16 07:11:00.000000000 +0100
+++ clamav-devel/libclamav/Makefile.am  2006-01-10 07:05:29.000000000 +0100
@@ -30,6 +30,8 @@
         matcher-ac.h \
         matcher-bm.c \
         matcher-bm.h \
+               matcher-bloom.c \
+               matcher-bloom.h \
         matcher.c \
         matcher.h \
         md5.c \
diff -Nur orig/libclamav/clamav.h clamav-devel/libclamav/clamav.h
--- orig/libclamav/clamav.h     2005-12-13 08:22:04.000000000 +0100
+++ clamav-devel/libclamav/clamav.h     2006-01-10 07:04:58.000000000 +0100
@@ -133,6 +133,15 @@
     struct cli_meta_node *next;
 };
 
+/* Additions for Bloom-AV */
+typedef unsigned char bit;
+
+struct cli_bloom_node {
+  char *signature_key, *signature_name;
+  unsigned int signature_length;
+  struct cli_bloom_node *next;
+};
+
 struct cli_matcher {
     unsigned int maxpatlen; /* maximal length of pattern in db */
     unsigned short ac_only;
@@ -145,6 +154,10 @@
     unsigned int ac_depth;
     struct cli_ac_node *ac_root, **ac_nodetable;
     unsigned int ac_partsigs, ac_nodes;
+
+    /* BloomAV additions */
+    bit* bloom_filter;
+    struct cli_bloom_node** sig_hashtable;
 };
 
 struct cl_engine {
diff -Nur orig/libclamav/matcher-bloom.c clamav-devel/libclamav/matcher-bloom.c
--- orig/libclamav/matcher-bloom.c      1970-01-01 01:00:00.000000000 +0100
+++ clamav-devel/libclamav/matcher-bloom.c      2006-01-10 07:04:58.000000000 
+0100
@@ -0,0 +1,246 @@
+#if HAVE_CONFIG_H
+#include "clamav-config.h"
+#endif
+
+#include "clamav.h"
+#include "matcher-bloom.h"
+
+/* optimum values for CPUs with 256 KB cache sizes */
+const int BETA = 7;
+const int BLOOM_FILTER_SIZE = 4*1024*1024;
+
+/* 1.5 * 4 =~ 6 MB memory overhead */
+const int HASHTABLE_PRIME = 1572869;
+
+bit *ba_new(const elem_t nelems) 
+{
+    size_t howmany =  NELEM(nelems,(BITS_SZ));
+    return ((bit *) cli_calloc(howmany, sizeof(bit)));
+} 
+
+void ba_assign(bit arr[], elem_t elem, const bool value) 
+{
+    if (value) 
+        arr[elem / BITS_SZ] |= (1 << (elem % BITS_SZ));
+    else 
+        arr[elem / BITS_SZ] &= ~(1 << (elem % BITS_SZ));
+}
+
+inline bool ba_value(const bit arr[], const elem_t elem) 
+{
+    return ((arr[elem / BITS_SZ] & (1 << (elem % BITS_SZ))) ? TRUE : FALSE);
+} 
+
+void init_bloom_filter(bit *to_init)
+{
+    int i;
+    for (i = 0; i < BLOOM_FILTER_SIZE; i++)
+        ba_assign(to_init, i, FALSE);
+}
+
+
+/* 4 very fast hash functions: Mask (literally inlined), xor, fasthash and
+ * sdbm. 
+ *
+ * len can NOT be shorter than 2+4 = 6 bytes. This puts a lower limit on how
+ * small Beta can be. There should be a check in this function to stop it from
+ * getting called with len < 6, but it's omitted for performance reasons (an
+ * extra branch instruction is very costly here).
+ */
+
+inline unsigned int xor_hash(unsigned char *buf, size_t len)
+{
+    unsigned int hash_val = 0;
+    hash_val ^= *((unsigned int *) (buf));
+    hash_val ^= *((unsigned int *) (buf+1));
+    hash_val ^= *((unsigned int *) (buf+2));
+    return hash_val;
+}
+
+/* fast hash from hashlib.c */
+inline unsigned long fast_hash(unsigned char *buf, size_t len)
+{
+    unsigned long hval = 0;
+
+    unsigned char *bptr = buf;          /* start of buffer */
+    unsigned char *bend = bptr + len;   /* beyond end of buffer */
+
+    while (bptr < bend)
+        hval = hval * 31UL + *bptr++;
+
+    return hval;
+}
+
+/* sdbm */
+inline unsigned long sdbm(unsigned char *buf, size_t len)
+{
+    unsigned long hval = 0;
+    int c;
+
+    unsigned char *bptr = buf;          /* start of buffer */
+    unsigned char *bend = bptr + len;   /* beyond end of buffer */
+
+    while (bptr < bend)
+    {
+        c = *bptr++;
+        hval = c + (hval << 6) + (hval << 16) - hval;
+    }
+
+    return hval;
+}
+
+bool is_possible_match(bit *filter_to_search, unsigned char* string_to_match)
+{  
+    int mask1024 = 0x3FFFFF;
+
+    if (ba_value(filter_to_search, (fast_hash(string_to_match, BETA) & 
mask1024)) == FALSE)
+        return FALSE;
+
+    if (ba_value(filter_to_search, (sdbm(string_to_match, BETA) & mask1024)) 
== FALSE)
+        return FALSE;
+
+    return TRUE;
+}
+
+/* Inserts entries into the bloom-filter */
+void set_bloom_filter_value(bit *filter_to_set, unsigned char *string_to_hash)
+{
+    int mask1024 = 0x3FFFFF;
+
+    ba_assign(filter_to_set, ((*(unsigned int *) string_to_hash) & mask1024), 
TRUE);
+    ba_assign(filter_to_set, (xor_hash(string_to_hash, BETA) & mask1024), 
TRUE);
+    ba_assign(filter_to_set, (fast_hash(string_to_hash, BETA) & mask1024), 
TRUE);
+    ba_assign(filter_to_set, (sdbm(string_to_hash, BETA) & mask1024), TRUE);
+}
+
+int insert_into_hash_table(struct cli_bloom_node* hash_table[], const char 
*virname, const char *hexsig, unsigned short type)
+{
+    struct cli_bloom_node* bloom_node = NULL;
+    int hash_key = 0;
+
+    if ((strlen(hexsig) / 2) < BETA)
+    {
+        cli_dbgmsg("BloomAV can't scan for very short virus sigs! Skipping.");
+        return 1;
+    }
+
+    bloom_node = (struct cli_bloom_node *) cli_malloc(sizeof(struct 
cli_bloom_node));
+
+    bloom_node->signature_key = (char *) cli_hex2str(hexsig);
+    bloom_node->signature_length = strlen(hexsig) / 2;
+    bloom_node->signature_name = strdup(virname);
+    bloom_node->next = NULL;
+
+    hash_key = sdbm(bloom_node->signature_key, BETA) % HASHTABLE_PRIME;
+
+    /* if item was already in the hash-table, do a list head insertion */
+    if (hash_table[hash_key])
+    {
+        struct cli_bloom_node* second_node = hash_table[hash_key]->next;
+        hash_table[hash_key]->next = bloom_node;
+        bloom_node->next = second_node;
+    }
+    else
+        hash_table[hash_key] = bloom_node;
+
+    /* Error checking for memory allocations omitted for now. */
+    return 0;
+}
+
+
+/* returns 1 if virus is found
+ * 0 otherwise
+ */
+int lookup_in_hash_table(const char* rest_of_file, int remaining_size, const 
char **virname, struct cli_bloom_node* hash_table[])
+{
+    int hash_key = sdbm((unsigned char*) rest_of_file, BETA) % HASHTABLE_PRIME;
+    struct cli_bloom_node* table_iter = hash_table[hash_key];
+
+    while (table_iter)
+    {
+        if (remaining_size < table_iter->signature_length)
+        {
+            table_iter = table_iter->next;
+            continue;
+        }
+
+        if (!memcmp(rest_of_file, table_iter->signature_key, 
table_iter->signature_length))
+        {
+            *virname = (char *) table_iter->signature_name;
+            return 1;
+        }
+
+        table_iter = table_iter->next;
+    }
+
+    /* else there is a false positive from Bloom filter, gather stats */
+    return 0;
+}
+
+int cli_bloom_init(struct cli_matcher *root)
+{
+    /* Add Error checking into this function. */
+    root->bloom_filter = ba_new(BLOOM_FILTER_SIZE);
+    init_bloom_filter(root->bloom_filter);
+    root->sig_hashtable = (struct cli_bloom_node **) 
cli_calloc(HASHTABLE_PRIME, sizeof(struct regular_signature *));
+    return 0;
+}
+
+void clean_bloom_structures(bit* bloom_filter, struct cli_bloom_node* 
hash_table[]) 
+{
+    int i = 0;
+    free(bloom_filter);
+
+    for (i = 0; i < HASHTABLE_PRIME; i++)
+    {
+        if (hash_table[i])
+        {
+            struct cli_bloom_node* list_head = hash_table[i];
+            while (list_head)
+            {
+                struct cli_bloom_node* sig_to_delete = list_head;
+                list_head = list_head->next;
+                free(sig_to_delete->signature_key);
+                free(sig_to_delete->signature_name);
+                free(sig_to_delete);
+            }
+        }
+    }
+
+    free(hash_table);
+}
+
+int cli_bloom_scanbuff(const char *buffer, unsigned int length, const char 
**virname, const struct cli_matcher *root, unsigned long int offset, unsigned 
short ftype)
+{
+    bit* bloom_filter = root->bloom_filter;
+    struct cli_bloom_node** sig_hashtable = root->sig_hashtable;
+
+    /* this is where we do the scanning. */
+    char *file_iterator = (char *) buffer;
+    long int file_size_ctr = 0;
+
+    if (offset != 0) return CL_VIRUS; /* not yet supported pass to bm as false 
positive */
+    while (file_size_ctr < length)
+    {
+
+        /* The fastest hash function is literally inlined. */
+        if ((ba_value(bloom_filter, ((*(unsigned int *) file_iterator) & 
0x3FFFFF)) == TRUE) &&
+            (ba_value(bloom_filter, (xor_hash(file_iterator, BETA) & 
0x3FFFFF)) == TRUE))
+        {
+            if (is_possible_match(bloom_filter, file_iterator) == TRUE)
+            {
+                if (lookup_in_hash_table((const char*) file_iterator, length - 
file_size_ctr, virname, sig_hashtable))
+                {
+                    /* cli_errmsg("Bloom filter found %s\n", *virname); */
+                    return CL_VIRUS;
+                }
+            }
+        }
+
+        file_size_ctr++;
+        file_iterator++;
+    }
+
+    /* cli_dbgmsg("Bloom filter says CLEAN\n"); */
+    return 0;
+}
diff -Nur orig/libclamav/matcher-bloom.h clamav-devel/libclamav/matcher-bloom.h
--- orig/libclamav/matcher-bloom.h      1970-01-01 01:00:00.000000000 +0100
+++ clamav-devel/libclamav/matcher-bloom.h      2006-01-10 07:04:58.000000000 
+0100
@@ -0,0 +1,52 @@
+#ifndef __MATCHER_BLOOM_H
+#define __MATCHER_BLOOM_H
+
+#if HAVE_CONFIG_H
+#include "clamav-config.h"
+#endif
+
+#define NELEM(N,ELEMPER) ((N + (ELEMPER) - 1) / (ELEMPER))
+#define BITS_SZ 8
+
+#ifndef __cplusplus
+/* required for AIX and Tru64 */
+#ifdef TRUE
+#undef TRUE
+#endif
+#ifdef FALSE
+#undef FALSE
+#endif
+
+typedef enum    { FALSE = 0, TRUE = 1 } bool;
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stddef.h>
+
+#include <clamav.h>
+
+typedef size_t elem_t;
+
+bit *ba_new(const elem_t nelems);
+void ba_assign(bit arr[], elem_t elem, const bool value);
+
+/* functions for our bloom filter */
+void init_bloom_filter(bit *to_init);
+void set_bloom_filter_value(bit *filter_to_set,unsigned char *string_to_hash);
+bool is_possible_match(bit *filter_to_search, unsigned char* string_to_match);
+
+int cli_bloom_init(struct cli_matcher *root);
+int insert_into_hash_table(struct cli_bloom_node* hash_table[], const char 
*virname, const char *hexsig, unsigned short type);
+int cli_bloom_scanbuff(const char *buffer, unsigned int length, const char 
**virname, const struct cli_matcher *root, unsigned long int offset, unsigned 
short ftype);
+
+#endif  /* __MATCHER_BLOOM_H */
diff -Nur orig/libclamav/matcher.c clamav-devel/libclamav/matcher.c
--- orig/libclamav/matcher.c    2006-01-07 04:27:08.000000000 +0100
+++ clamav-devel/libclamav/matcher.c    2006-01-10 07:04:58.000000000 +0100
@@ -30,6 +30,7 @@
 #include "others.h"
 #include "matcher-ac.h"
 #include "matcher-bm.h"
+#include "matcher-bloom.h"
 #include "md5.h"
 #include "filetypes.h"
 #include "matcher.h"
@@ -88,8 +89,10 @@
            return CL_EMEM;
        }
 
-       if(troot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, 
troot, 0, ftype, -1)) != CL_VIRUS)
-           ret = cli_ac_scanbuff(buffer, length, virname, troot, partcnt, 0, 
0, partoff, ftype, -1, NULL);
+       if(troot->ac_only ||
+               (((ret = cli_bloom_scanbuff(buffer, length, virname, troot, 0, 
ftype)) != CL_VIRUS) ||
+               ((ret = cli_bm_scanbuff(buffer, length, virname, troot, 0, 
ftype, -1)) != CL_VIRUS)))
+                       ret = cli_ac_scanbuff(buffer, length, virname, troot, 
partcnt, 0, 0, partoff, ftype, -1, NULL);
 
        free(partcnt);
        free(partoff);
@@ -109,8 +112,10 @@
        return CL_EMEM;
     }
 
-    if(groot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, 
groot, 0, ftype, -1)) != CL_VIRUS)
-       ret = cli_ac_scanbuff(buffer, length, virname, groot, partcnt, 0, 0, 
partoff, ftype, -1, NULL);
+    if(troot->ac_only ||
+               (((ret = cli_bloom_scanbuff(buffer, length, virname, troot, 0, 
ftype)) != CL_VIRUS) ||
+               ((ret = cli_bm_scanbuff(buffer, length, virname, groot, 0, 
ftype, -1)) != CL_VIRUS)))
+                       ret = cli_ac_scanbuff(buffer, length, virname, groot, 
partcnt, 0, 0, partoff, ftype, -1, NULL);
 
     free(partcnt);
     free(partoff);
@@ -371,8 +376,10 @@
            length -= SCANBUFF - bytes;
 
        if(troot) {
-           if(troot->ac_only || (ret = cli_bm_scanbuff(pt, length, virname, 
troot, offset, ftype, desc)) != CL_VIRUS)
-               ret = cli_ac_scanbuff(pt, length, virname, troot, tpartcnt, 
otfrec, offset, tpartoff, ftype, desc, ftoffset);
+        if(troot->ac_only ||
+               (((ret = cli_bloom_scanbuff(pt, length, virname, troot, offset, 
ftype)) != CL_VIRUS) ||
+               ((ret = cli_bm_scanbuff(pt, length, virname, troot, offset, 
ftype, desc)) != CL_VIRUS)))
+                       ret = cli_ac_scanbuff(pt, length, virname, troot, 
tpartcnt, otfrec, offset, tpartoff, ftype, desc, ftoffset);
 
            if(ret == CL_VIRUS) {
                free(buffer);
@@ -389,8 +396,10 @@
            }
        }
 
-       if(groot->ac_only || (ret = cli_bm_scanbuff(pt, length, virname, groot, 
offset, ftype, desc)) != CL_VIRUS)
-           ret = cli_ac_scanbuff(pt, length, virname, groot, gpartcnt, otfrec, 
offset, gpartoff, ftype, desc, ftoffset);
+    if(groot->ac_only ||
+               (((ret = cli_bloom_scanbuff(pt, length, virname, groot, offset, 
ftype)) != CL_VIRUS) ||
+               ((ret = cli_bm_scanbuff(pt, length, virname, groot, offset, 
ftype, desc)) != CL_VIRUS)))
+                       ret = cli_ac_scanbuff(pt, length, virname, groot, 
gpartcnt, otfrec, offset, gpartoff, ftype, desc, ftoffset);
 
        if(ret == CL_VIRUS) {
            free(buffer);
diff -Nur orig/libclamav/readdb.c clamav-devel/libclamav/readdb.c
--- orig/libclamav/readdb.c     2005-12-13 08:22:04.000000000 +0100
+++ clamav-devel/libclamav/readdb.c     2006-01-10 07:04:58.000000000 +0100
@@ -36,6 +36,7 @@
 #include "strings.h"
 #include "matcher-ac.h"
 #include "matcher-bm.h"
+#include "matcher-bloom.h"
 #include "others.h"
 #include "str.h"
 #include "defaults.h"
@@ -60,6 +61,28 @@
 
 /* TODO: clean up the code */
 
+/**
+ *  Helper function for BloomAV
+ */
+int cli_bloom_addsig(struct cli_matcher *root, const char *virname,  const 
char *hexsig, unsigned short type)
+{
+    char *binary_sig = NULL;
+
+    if (strpbrk(hexsig, "?(*{"))
+    {
+        cli_errmsg("BloomAV won't scan poly signatures! Skipping.");
+        return 0;
+    }
+
+    binary_sig = cli_hex2str(hexsig);
+    if (!binary_sig) return CL_EMEM;
+
+    set_bloom_filter_value(root->bloom_filter, binary_sig);
+    insert_into_hash_table(root->sig_hashtable, virname, hexsig, type);
+    free(binary_sig);
+    return 0;
+}
+
 static int cli_ac_addsig(struct cli_matcher *root, const char *virname, const 
char *hexsig, int sigid, int parts, int partno, unsigned short type, unsigned 
int mindist, unsigned int maxdist, char *offset, unsigned short target)
 {
        struct cli_ac_patt *new;
@@ -379,7 +402,15 @@
            return ret;
        }
 
-    } else {
+    } 
+    /* While adding signatures to BM, also add to BloomAV */
+    else
+    {
+      if (ret = cli_bloom_addsig(root, virname, hexsig, 0))
+      {
+           cli_errmsg("cli_parse_add(): Problem adding signature (3).\n");
+           return ret;
+      }
        bm_new = (struct cli_bm_patt *) calloc(1, sizeof(struct cli_bm_patt));
        if(!bm_new)
            return CL_EMEM;
@@ -490,10 +521,19 @@
                    cli_errmsg("Can't initialise BM pattern matcher\n");
                    return ret;
                }
-           }
-       }
-    }
 
+               /* Adding initialization code for BloomAV */
+               /* Yup, we should check separately for bloom filter and 
hash-table */
+               if (!root->bloom_filter) {
+               cli_dbgmsg("Initializing Bloom filter and signature table of 
root[%d]\n", i);
+               if((ret = cli_bloom_init(root))) {
+                       cli_errmsg("Can't initialise Bloom pattern matcher\n");
+                       return ret;
+               }
+               }
+        }
+    }
+    }
     return 0;
 }
 
@@ -575,7 +615,14 @@
        cl_free(*engine);
        return ret;
     }
-
+#if 0
+       /* Adding initialization code for BloomAV */
+       /* Yup, we should check separately for bloom filter and rb-tree */
+    if (!(*(*engine)->root)->bloom_filter) {
+    cli_dbgmsg("Initializing Bloom filter and signature table\n");
+       cli_bloom_init((*(*engine)->root));
+    }
+#endif
     while(fgets(buffer, FILEBUFF, fd)) {
        line++;
 
_______________________________________________
http://lurker.clamav.net/list/clamav-devel.html
Please submit your patches to our Bugzilla: http://bugs.clamav.net

Reply via email to