xml-shred xml-shred.c, 1.1, 1.2

Manuel Mayr Tue, 08 May 2007 01:33:43 -0700

Update of /cvsroot/monetdb/pathfinder/src/tools/xml-shred
In directory sc8-pr-cvs16:/tmp/cvs-serv7011


Modified Files:
        xml-shred.c 
Log Message:


We would like to introduce a new variant of the pre/size/level encoding
to support attributes in the SQL backend.

What we have now is (1) a pre/size/level/kind/name_id/value table
and (2) a name_id/name table.

The name_id/name table (2) is used to store the names of tags and attributes.
The value field stores the content of attributes or
text nodes. name_id in (1) is a foreign key that references name_id in (2).

If you want to use this encoding you have to use the shredder with the option 
-s.


Index: xml-shred.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/src/tools/xml-shred/xml-shred.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- xml-shred.c 4 May 2007 09:25:54 -0000       1.1
+++ xml-shred.c 8 May 2007 08:32:40 -0000       1.2
@@ -41,10 +41,12 @@
 #include "libxml/parserInternals.h"
 
 #define STACK_MAX 100
-#define PROPSIZE 32
+#define PROPSIZE 31999 
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 
+#define NAME_ID 0
+
 typedef long int nat;
 
 enum kind_t {
@@ -57,6 +59,77 @@
 };
 typedef enum kind_t kind_t;
 
+/* definitions for our hashtable */
+
+/* code returned by lookups when no matching key exists */
+#define NO_KEY (-1)
+
+/* true if k is the "no such key" code */
+#define NOKEY(k) ((k) == NO_KEY)
+
+/* size of the hashtable (number of slots) */
+#define HASHTABLE_SIZE 2000
+
+/* Prime number used by the compression function.
+ * By Bertrand's postulate there is a prime p that satisfies
+ *     HASHTABLE_SIZE < p <= 2 * HASHTABLE_SIZE
+ */
+#define PRIME 2011
+
+/**
+ * Compression function (multiply-add-divide)
+ * We use a universal hash function to map a hash code to a slot.
+ *
+ * The intermediate "+ PRIME) % PRIME" step folds a negative remainder
+ * back into [0, PRIME), so the result is always a valid slot index in
+ * [0, HASHTABLE_SIZE) for any key in (-PRIME, PRIME).  For non-negative
+ * keys the result is the same as without that step.
+ */
+#define MAD(key) \
+    (((((123 * (key) + 593) % PRIME) + PRIME) % PRIME) % HASHTABLE_SIZE)
+
+/* 33 has proved to be a good choice
+ * for polynomial hash functions
+ */
+#define POLY_A 33
+
+/* only this many leading characters of a key are hashed */
+#define HASH_PREFIX_LEN 10
+
+/**
+ * Hashfunction
+ */
+int hashfunc(char *str);
+
+/**
+ * Hash the first HASH_PREFIX_LEN characters of str.
+ *
+ * The prefix is copied into a stack buffer, so nothing is allocated
+ * (and nothing leaked) per lookup.
+ *
+ * @param str  NUL-terminated key
+ * @return     whatever hashfunc() returns for the prefix
+ */
+static int hash_prefix(const char *str)
+{
+    char prefix[HASH_PREFIX_LEN + 1];
+    unsigned int n = 0;
+
+    while (n < HASH_PREFIX_LEN && str[n] != '\0') {
+        prefix[n] = str[n];
+        n++;
+    }
+    prefix[n] = '\0';
+
+    return hashfunc(prefix);
+}
+
+/* slot index of the hashtable for the key str */
+#define HASHFUNCTION(str) MAD(hash_prefix(str))
+
+
+/* We use a separate chaining strategy to
+ * maintain our hash_table:
+ * each slot of the table is the head of a linked list of buckets,
+ * so keys whose hashes collide share one list.
+ */
+typedef struct bucket_t bucket_t;
+struct bucket_t
+{
+    char *key;               /**< key as string */
+    int id;         /**< name_id assigned to the key */
+    bucket_t* next;          /**< next bucket in our list */
+};
+
+/* hash table: array of HASHTABLE_SIZE bucket lists,
+ * allocated only when the -s option is given */
+bucket_t **hash_table;
+
+/* search one bucket list for key;
+ * returns the id stored with it, or NO_KEY if it is absent */
+int find_bucket(bucket_t *bucket, char *key);
+
+/* look up key in the hashtable;
+ * returns the id stored with it, or NO_KEY if it is absent */
+int find_element(bucket_t **hash_table, char *key);
+
+/* add id and key to the front of the bucket list unless key is
+ * already present; returns the head of the resulting list */
+bucket_t *bucket_insert(bucket_t *bucket,  char *key, int id);
+
+/* insert key and id into the hashtable */
+void hashtable_insert(bucket_t **hash_table, char *key, int id);
+
+/* free memory assigned to hash_table */
+void free_hash(bucket_t **hash_table);
+
+/* return a brand new name_id (consecutive values starting at NAME_ID) */
+int new_nameid();
+
+
 typedef struct node_t node_t;
 struct node_t {
     nat       pre;
@@ -65,6 +138,7 @@
     nat       post_stretched;
     nat       size;
     int       level;
+    int       name_id;
     node_t   *parent;
     kind_t    kind;
     xmlChar  *prop;
@@ -78,7 +152,7 @@
 static nat rank;
 static nat att_id;
 
-static char *format = "%e, %o, %p, %k, %t";
+static char *format = "%e, %s, %l, %k, %t";
 FILE *out;
 FILE *out_attr;
 char  filename[FILENAME_MAX];
@@ -87,6 +161,7 @@
 char  outfile_atts[FILENAME_MAX];
 bool  outfile_given = false;
 bool  suppress_attributes = false;
+bool  sql_atts = false;
 
 static void print_tuple (node_t tuple);
 static void flush_buffer (void);
@@ -154,6 +229,24 @@
 
     assert (level < STACK_MAX);
 
+    /* try to find the tagname in the
+     * hashtable */
+    int name_id = -1; 
+    if(sql_atts) { 
+       name_id = find_element(hash_table, (char*)tagname);
+
+       /* key not found */
+       if (NOKEY(name_id)) {
+
+           /* create a new id */
+           name_id = new_nameid();     
+
+           hashtable_insert(hash_table, (char*)tagname, name_id);
+
+           fprintf (out_attr, "%i, \"%s\"\n", name_id, 
strndup((char*)tagname,PROPSIZE));
+       }
+    }
+
     stack[level] = (node_t) {
         .pre            = pre,
         .post           = 0,
@@ -162,19 +255,53 @@
         .size           = 0,
         .level          = level,
         .parent         = stack + level - 1,
+       .name_id        = name_id, 
         .kind           = elem,
-        .prop           = strndup (tagname, PROPSIZE)
+        .prop           = NULL 
     };
 
+
     /*
      * FIXME: handle attributes here
      */
     if (!suppress_attributes && atts)
-        while (*atts) {
-            fprintf (out_attr, "%lu, %lu, \"%s\", \"%s\"\n", att_id++, pre,
-                     atts[0], atts[1]);
-            atts += 2;
-        }
+        if (!sql_atts)
+           while (*atts) {
+               fprintf (out_attr, "%lu, %lu, \"%s\", \"%s\"\n", att_id++, pre,
+                        atts[0], atts[1]);
+               atts += 2;
+           }
+       /* handle attributes as we need for sql generation */
+       else
+          while (*atts) {
+               /* try to find the tagname in the
+                * hashtable */
+               name_id = find_element(hash_table, (char*)tagname);
+
+               /* key not found */
+               if(NOKEY(name_id)) {
+
+                   /* create a new id */
+                   name_id = new_nameid();     
+
+                      printf("NOKEY, %i\n", name_id);
+                   hashtable_insert(hash_table, (char*)atts[0], name_id);
+               }
+
+               pre++;
+               print_tuple ((node_t) {
+                       .pre = pre,
+                       .post = 0,
+                       .pre_stretched = 0,
+                       .post_stretched = 0,
+                       .size = 0,
+                       .level = 1,
+                       .parent = 0,
+                       .name_id = name_id,
+                       .kind = attr,
+                       .prop = (char*)atts[1]});
+               atts += 2;
+          }
 }
 
 static void
@@ -226,6 +353,7 @@
             .size           = 0,
             .level          = level,
             .parent         = stack + level - 1,
+           .name_id        = -1,
             .kind           = text,
             .prop           = buf,
         };
@@ -275,18 +403,21 @@
 static void
 print_help (int argc, char **argv)
 {
-    printf ("%s - encode XML document in pre/post encoding\n"
+    printf ("%s - encode XML document in different encodings\n"
             "Usage: %s -h             print this help screen\n"
             "       %s -f <filename>  parse XML file <filename>\n"
             "       %s -o <filename>  output filename\n"
-            "       %s -a             suppress attributes\n",
-            argv[0], argv[0], argv[0], argv[0], argv[0]);
+            "       %s -a             suppress attributes\n"
+            "       %s -s             sql encoding supported by pathfinder\n"
+           "                         \t\t(that is probably what you want)\n",
+            argv[0], argv[0], argv[0], argv[0], argv[0], argv[0]);
 }
 
 static void
 print_tuple (node_t tuple)
 {
-    for (unsigned int i = 0; format[i]; i++)
+    unsigned int i;
+    for (i = 0; format[i]; i++)
         if (format[i] == '%') {
             i++;
             switch (format[i]) {
@@ -319,22 +450,34 @@
                                default: assert (0);
                            }
                            break;
+               case 'n':
+                       if (tuple.name_id == -1)
+                           fprintf(out, "NULL");
+                       else 
+                           fprintf(out, "%i", tuple.name_id); break;
                 case 't':
-                    putc ('"', out);
-                    for (unsigned int i = 0; i < PROPSIZE && tuple.prop[i]; 
i++)
-                        switch (tuple.prop[i]) {
-                            case '\n': putc (' ', out); break;
+               {
+                   if(tuple.prop) {
+                       unsigned int i;
+                       putc ('"', out);
+                       for (i = 0; i < PROPSIZE && tuple.prop[i]; i++)
+                           switch (tuple.prop[i]) {
+                               case '\n': putc (' ', out); break;
                             case '"':  putc ('"', out); putc ('"', out);
                                        break;
                             default:   putc (tuple.prop[i], out);
                         }
-                    putc ('"', out);
-                    break;
+                       putc ('"', out);
+                   }
+                   else {
+                       fprintf(out, "NULL");
+                   }
+               } break;    
 
                 default:   putc (format[i], out); break;
             }
-        }
-        else
+           }
+           else
             putc (format[i], out);
 
     putc ('\n', out);
@@ -348,9 +491,11 @@
     suppress_attributes = false;
     outfile_given = false;
 
+    
+
     /* parse command line using getopt library */
     while (true) {
-        int c = getopt (argc, argv, "F:af:ho:");
+        int c = getopt (argc, argv, "F:af:ho:s");
 
         if (c == -1)
             break;
@@ -375,6 +520,11 @@
                 outfile_given = true;
                 break;
 
+           case 's':
+               sql_atts = true;
+               format = "%e, %s, %l, %k, %n, %t";
+               break;
+
             case 'h':
                 print_help (argc, argv);
                 exit (0);
@@ -382,6 +532,10 @@
         }
     }
 
+    /* if we need sql encoding we need to initialize the hashtable */
+    if(sql_atts)
+       hash_table = (bucket_t**) malloc (HASHTABLE_SIZE * sizeof(bucket_t));
+
     if (!outfile_given && !suppress_attributes) {
         fprintf (stderr, "Attribute generation requires output filename.\n");
         print_help (argc, argv);
@@ -409,6 +563,8 @@
         exit (EXIT_FAILURE);
     }
 
+
+
     /* start XML parsing */
     ctx = xmlCreateFileParserCtxt (filename);
     ctx->sax = &saxhandler;
@@ -417,10 +573,109 @@
 
     if (! ctx->wellFormed) {
         fprintf (stderr, "XML input not well-formed\n");
+       if (sql_atts)
+           free_hash(hash_table);
         exit (EXIT_FAILURE);
     }
 
     fprintf (stderr, "tree height was %i\n", max_level);
+    if (sql_atts)
+        fprintf(stderr, "There are %i tagnames and attribute names in the 
document\n", --new_nameid());
 
+    if (sql_atts)
+       free_hash(hash_table);
     return 0;
 }
+
+/**
+ * Hashfunction
+ *
+ * Polynomial hash of a string, evaluated with Horner's rule
+ * (base POLY_A, reduced modulo PRIME after every step).
+ *
+ * You should use the macro #HASHFUNCTION
+ * to apply the function only to a
+ * fragment of the string.
+ *
+ * @param str  NUL-terminated string to hash; the empty string hashes to 0
+ * @return     hash code in the range [0, PRIME)
+ */
+int hashfunc(char *str)
+{
+    int h = 0;
+    const char *c;
+
+    /* applying Horner's rule front to back; no temporary copies of
+     * the string are needed */
+    for (c = str; *c != '\0'; c++)
+        h = (h * POLY_A + ((int) *c - 'a')) % PRIME;
+
+    /* characters below 'a' contribute negative terms and '%' keeps the
+     * sign of its left operand, so fold a negative remainder back into
+     * the non-negative range before it is used to pick a slot */
+    if (h < 0)
+        h += PRIME;
+
+    return h;
+}
+
+
+
+/**
+ * Search one bucket list for a key.
+ *
+ * @param bucket  head of the list (may be empty)
+ * @param key     NUL-terminated key to look for
+ * @return        id stored with the key, or NO_KEY if it is not in the list
+ */
+int find_bucket(bucket_t *bucket, char *key)
+{
+    bucket_t *cur;
+
+    for (cur = bucket; cur; cur = cur->next) {
+        if (strcmp(cur->key, key) == 0)
+            return cur->id;
+    }
+
+    return NO_KEY;
+}
+
+/**
+ * Look up a key in the hashtable.
+ *
+ * @param hash_table  table to search
+ * @param key         NUL-terminated key to look for
+ * @return            id stored with the key, or NO_KEY if it is absent
+ */
+int find_element(bucket_t **hash_table, char *key)
+{
+    /* pick the slot first, then scan only that slot's bucket list */
+    int slot = HASHFUNCTION(key);
+
+    return find_bucket(hash_table[slot], key);
+}
+
+/**
+ * Hand out a brand new name_id.
+ *
+ * Ids are consecutive; the first call returns NAME_ID.
+ */
+int new_nameid()
+{
+    static unsigned int next_id = NAME_ID;
+    unsigned int fresh = next_id;
+
+    next_id = next_id + 1;
+    return fresh;
+}
+
+/**
+ * Add id and key to the bucket list.
+ *
+ * If the key is already present the list is left untouched (the old id
+ * wins).  Otherwise a new bucket holding a private copy of the key is
+ * put at the front of the list.
+ *
+ * Running out of memory is fatal: a message is printed to stderr and
+ * the program exits with EXIT_FAILURE.
+ *
+ * @param bucket  current head of the list (may be empty)
+ * @param key     NUL-terminated key; it is copied, the caller keeps
+ *                ownership of its own string
+ * @param id      name_id to store with the key
+ * @return        head of the resulting list
+ */
+bucket_t *bucket_insert(bucket_t *bucket,  char *key, int id)
+{
+    bucket_t *entry;
+
+    /* key already stored: nothing to do */
+    if (find_bucket(bucket, key) != NO_KEY)
+        return bucket;
+
+    entry = malloc(sizeof *entry);
+    if (!entry) {
+        fprintf (stderr, "out of memory while extending the name table\n");
+        exit (EXIT_FAILURE);
+    }
+
+    entry->key = strdup(key);
+    if (!entry->key) {
+        free(entry);
+        fprintf (stderr, "out of memory while extending the name table\n");
+        exit (EXIT_FAILURE);
+    }
+
+    entry->id = id;
+    /* add the new bucket to the front of the list */
+    entry->next = bucket;
+
+    return entry;
+}
+
+/**
+ * Insert key and id into the hashtable.
+ *
+ * @param hash_table  table to insert into; must not be NULL
+ * @param key         NUL-terminated key
+ * @param id          name_id to store with the key
+ */
+void hashtable_insert(bucket_t **hash_table, char *key, int id)
+{
+     int slot;
+
+     assert (hash_table != NULL);
+
+     /* the slot's list head changes when a new bucket is put in front */
+     slot = HASHFUNCTION(key);
+     hash_table[slot] = bucket_insert(hash_table[slot], key, id);
+}
+
+/**
+ * Free the memory held by the buckets of hash_table.
+ *
+ * Every bucket of every slot is released together with its copy of the
+ * key, and the slot is reset to the empty list.  The slot array itself
+ * is not released; it stays owned by whoever allocated it.
+ *
+ * NOTE(review): this walks all HASHTABLE_SIZE slots, so every slot must
+ * be either empty or a valid list head.  The table is obtained with a
+ * plain malloc; confirm that unused slots are zeroed.
+ *
+ * @param hash_table  table to clean up; a NULL table is ignored when
+ *                    assertions are disabled
+ */
+void free_hash(bucket_t **hash_table)
+{
+   int i;
+
+   assert (hash_table != NULL);
+   if (!hash_table) return;
+
+   for (i = 0; i < HASHTABLE_SIZE; i++) {
+        bucket_t *cur = hash_table[i];
+
+        while (cur) {
+            /* remember the successor before the bucket goes away */
+            bucket_t *next = cur->next;
+
+            free(cur->key);
+            free(cur);
+            cur = next;
+        }
+        hash_table[i] = NULL;
+   }
+}


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

[Monetdb-pf-checkins] pathfinder/src/tools/xml-shred xml-shred.c, 1.1, 1.2

Reply via email to