Update of /cvsroot/monetdb/pathfinder/backends/monet5
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv28033
Modified Files:
xquery_shredder.mx
Log Message:
Changes to the shredder so it can be compiled in M5
U xquery_shredder.mx
Index: xquery_shredder.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/backends/monet5/xquery_shredder.mx,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- xquery_shredder.mx 9 Apr 2009 09:42:50 -0000 1.1
+++ xquery_shredder.mx 9 Apr 2009 15:36:43 -0000 1.2
@@ -26,55 +26,46 @@
@'
@' $Id$
-@f shredder
-@a Jan Flokstra
-@a Peter Boncz
-@a Niels Nes
-@t shredder
+@f xquery_shredder
+@a
+@t Monet5/XQuery XML Document Shredder
+
+This shredder is based on the original implementation by Jan Flokstra, Peter
+Boncz, and Niels Nes. The original shredder can be found at:
+pathfinder/runtime/shredder.mx (snapshot tag M42M5)
@h
-#ifndef SHREDDER_H
-#define SHREDDER_H
+#ifndef _XQ_SHREDDER_H
+#define _XQ_SHREDDER_H
#define XML_DEPTH_MAX 128
-#include "serialize.h" /* for struct serializeFunStruct & struct XqueryCtx */
-#include "pf_support.h"
-
-/* for PFtijah */
-pf_support_export int shred(BAT *docBAT, str location, str buffer, stream *s,
lng percentage, struct serializeFunStruct *serFun, struct XqueryCtx *serCtx,
lock *collLock);
+#include "xquery.h"
+#include <lock.h>
-/* MIL commands to shred XML from a {uri,string,stream}. */
-pf_support_export int CMDshred_url(BAT *docBAT, str location, lng *percentage,
lock* collLock, bit *verbose);
-pf_support_export int CMDshred_str(BAT *docBAT, str buffer, lng *percentage,
lock* collLock, bit *verbose);
-pf_support_export int CMDshred_stream(BAT *docBAT, Stream *s, lng *percentage,
lock* collLock, bit *verbose);
+/* MAL commands to shred XML from a {uri,string,stream} */
+xquery_export str XQUERYShredUrl(int retval, bat *doc, str location,
+ lng *percentage, lock* collLock, bit *verbose);
+xquery_export str XQUERYShredStr(int retval, bat *doc, str buffer,
+ lng *percentage, lock* collLock, bit *verbose);
+xquery_export str XQUERYShredStream(int retval, bat *doc, stream *s,
+ lng *percentage, lock* collLock, bit *verbose);
-#endif /* SHREDDER_H */
+#endif /* _XQ_SHREDDER_H */
@c
-/*
====================================================================================
- * shredder.mx : xml document shredder for pathfinder
- *
- * Original code by Jan Flokstra.
- * Incremental stream parser addition by Niels Nes.
- * Revised by Peter Boncz(code restructuring, attempt at documentation,
multi-doc
- * collections, garbage collection, error checking,
atomicity).
- *
====================================================================================
*/
+#include "pf_config.h"
+#include <mal_exception.h>
+#include "xquery_shredder.h"
+#include "libxml/parser.h" /* SAX parser interface(libxml2) */
+#include "libxml/parserInternals.h"
+#include "temp.h" /* TEMPORARY DEFINITIONS TO MAKE THE M42M5 WORK */
+#include "serialize.h" /* for struct serializeFunStruct & struct XqueryCtx */
-/* #define DEBUG */
+#define DEBUG_XQ_SHRD
#define PRINT_STAT 0
-#include <pf_config.h>
-#include <gdk.h>
-#include <monet.h>
-
-#include "pathfinder.h"
-#include "pf_support.h"
-#include "shredder.h"
-
-/**
- * XML node kinds
- */
+/* XML node kinds */
#define ELEMENT 0
#define PFTEXT 1
#define COMMENT 2
@@ -82,10 +73,6 @@
#define DOCUMENT 4
#define COLLECTION 5
-/* SAX parser interface(libxml2) */
-#include "libxml/parser.h"
-#include "libxml/parserInternals.h"
-
char*
timer_str(lng elapsed, char* buf)
{
@@ -114,34 +101,37 @@
return buf;
}
-/*
====================================================================================
- * attribute DB to distinguish attrs that have ID/IDREF semantics(filled
during DTD parse)
- * - attrDB_init() initialize the attribute database
- * - attrDB_find() find if an attribute is already registered in the
database
- * - attrDB_insert() register an attribute as an ID/IDREF attribute
- * - attrDB_check() check if this attribute is an ID/IDREF attribute
- * - attrDB_free() free resources
- *
====================================================================================
*/
+...@- Attribute DB
+
+We build an attribute DB to distinguish attrs that have ID/IDREF semantics. It
+is populated during DTD parse. The following functions are available:
+
+ - attrDB_init() initialize the attribute database
+ - attrDB_find() find if an attribute is already registered in the database
+ - attrDB_insert() register an attribute as an ID/IDREF attribute
+ - attrDB_check() check if this attribute is an ID/IDREF attribute
+ - attrDB_free() free resources
+
+...@c
/* #define ADB_DEBUG */
/* #define ADB_USE_NAMES */
-
#define ADB_STARTSIZE 256
typedef struct specAttrStr {
char* ownerTag;
char* attrTag;
- int type; /* XML_ATTRIBUTE_[ID|IDREF|IDREFS] */
+ int type; /* XML_ATTRIBUTE_[ID|IDREF|IDREFS] */
} specAttrStr;
typedef struct idrefAttrDBStr {
- int doCheck; /* boolean to signal if the attr db is in use */
+ int doCheck; /* boolean to signal if the attr db is in use */
int n_specAttr;
int max_specAttr;
specAttrStr* specAttr;
} idrefAttrDBStr;
-static void
+static void
attrDB_init(idrefAttrDBStr* adb)
{
#ifdef ADB_DEBUG
@@ -150,48 +140,43 @@
adb->n_specAttr = adb->max_specAttr = 0;
adb->specAttr = 0;
#ifdef ADB_USE_NAMES
- /* in this case even when no attributes are encountered the db is in use.
*/
+ /* in this case even when no attributes are encountered the db is in use */
adb->doCheck = 1;
#else
adb->doCheck = 0;
#endif
}
-static specAttrStr*
-attrDB_find(idrefAttrDBStr *adb,
- char *oTag,
- char *aTag)
+static specAttrStr*
+attrDB_find(idrefAttrDBStr *adb, char *oTag, char *aTag)
{
int i;
for(i=0; i<adb->n_specAttr; i++) {
specAttrStr *sas = &adb->specAttr[i];
if ((strcmp(oTag, sas->ownerTag) == 0) &&
- (strcmp(aTag, sas->attrTag) == 0)) {
-#ifdef ADB_DEBUG
- /* stream_printf(GDKout, "[ADB] found(%s, %s, %d)\n",
sas->ownerTag, sas->attrTag, sas->type); */
-#endif
+ (strcmp(aTag, sas->attrTag) == 0)) {
return sas;
- };
+ }
}
return NULL;
}
-static int
-attrDB_check(idrefAttrDBStr *adb,
- char *oTag,
- char *aTag)
+static int
+attrDB_check(idrefAttrDBStr *adb, char *oTag, char *aTag)
{
- if (adb->doCheck) { /* check if db is in use */
+ if (adb->doCheck) { /* check if db is in use */
specAttrStr *sas;
if ((sas = attrDB_find(adb, oTag, aTag)))
return sas->type;
else {
#ifdef ADB_USE_NAMES
- if ((strcmp(aTag, "id") == 0) ||(strcmp(aTag, "ID") == 0))
+ if ((strcmp(aTag, "id") == 0) || (strcmp(aTag, "ID") == 0))
return XML_ATTRIBUTE_ID;
- else if ((strcmp(aTag, "idref") == 0) ||(strcmp(aTag, "IDREF") ==
0))
+ else if ((strcmp(aTag, "idref") == 0) ||
+ (strcmp(aTag, "IDREF") == 0))
return XML_ATTRIBUTE_IDREF;
- else if ((strcmp(aTag, "idrefs") == 0) ||(strcmp(aTag, "IDREFS")
== 0))
+ else if ((strcmp(aTag, "idrefs") == 0) ||
+ (strcmp(aTag, "IDREFS") == 0))
return XML_ATTRIBUTE_IDREFS;
#endif
}
@@ -199,17 +184,15 @@
return 0;
}
-static int
-attrDB_insert(idrefAttrDBStr *adb,
- char *oTag,
- char *aTag,
- int type)
+static int
+attrDB_insert(idrefAttrDBStr *adb, char *oTag, char *aTag, int type)
{
specAttrStr *sas = NULL;
- adb->doCheck = 1; /* signal database is in use */
+ adb->doCheck = 1; /* signal database is in use */
if (attrDB_find(adb, oTag, aTag)) {
- GDKerror("attrDB_insert: idrefAttrDB(%s, %s, %d) is duplicate.\n",
oTag, aTag, type);
+ GDKerror("attrDB_insert: idrefAttrDB(%s, %s, %d) is duplicate.\n",
+ oTag, aTag, type);
return GDK_FAIL;
}
oTag = GDKstrdup(oTag);
@@ -243,7 +226,7 @@
return GDK_FAIL;
}
-static void
+static void
attrDB_free(idrefAttrDBStr *adb)
{
int i;
@@ -263,10 +246,9 @@
#endif
}
-/*
====================================================================================
- * main shredder data structures(the shredCtx, containing the shredBATdefs)
- *
====================================================================================
*/
+...@- The main shredder data structure (the shredCtx, containing the
shredBATdefs)
+...@c
/* define the number of BATs in the required result set */
#define SHRED_BATS (ATTR_PROP+1)
@@ -312,7 +294,7 @@
oid dupl; /* the number of duplicate BUNins so far */
BUN size; /* current maximum size of the BAT */
BAT *bat; /* the physical BAT */
- BUN abortSize; /* size previous to shredding */
+ BUN abortSize; /* size previous to shredding */
size_t heapSize; /* size of string tail heap(->free) */
void *heapBackup; /* image of the internal str-heap hash-table */
/* */
@@ -339,11 +321,11 @@
*/
typedef struct node_t node_t;
struct node_t {
- oid pre; /**< preorder rank */
- oid nid; /**< stable nid */
- int level; /**< tree level of parent(0 if root, -1 if
document node) */
- oid prop; /**< property ID */
- chr kind; /**< node kind */
+ oid pre; /* preorder rank */
+ oid nid; /* stable nid */
+ int level; /* tree level of parent(0 if root, -1 if document node) */
+ oid prop; /* property ID */
+ chr kind; /* node kind */
};
/* the computing context environment of the shredding process */
@@ -362,7 +344,7 @@
int sp; /* the stackpointer */
/* buffer administration */
int content; /* content ptr in char buff */
- int content_max; /* max size of content buffer */
+ int content_max; /* max size of content buffer */
char *content_buf; /* character buffer */
/* */
size_t fileSize; /* size of the input file */
@@ -371,12 +353,12 @@
BUN pageFree; /* number of tuples per page to leave empty */
/* attributes for shred into collection */
- int incremental; /* boolean: incremental shred? */
- int updatable; /* boolean: is nid-rid-pre the identity
mapping? */
- char *val; /* temporary PFSHRED_BUFLEN-byte string
workspace */
-
+ int incremental; /* boolean: incremental shred? */
+ int updatable; /* boolean: is nid-rid-pre the identity mapping? */
+ char *val; /* temporary PFSHRED_BUFLEN-byte string workspace */
+
/* the database containing the ID|IDREF|IDREFS attributes */
- idrefAttrDBStr idrefAttrDB;
+ idrefAttrDBStr idrefAttrDB;
char* base_dir; /* base location of the shredded doc. Used to
* locate relative external subsets */
@@ -401,17 +383,17 @@
#define PFSHRED_BUFLEN (1 << 10)
#define XML_TAG_MAX PFSHRED_STRLEN_MAX
-/*
====================================================================================
- * shredder low-level routines that add data to bats, while parsing
- * - shredBAT_newsize() compute a suitable new bat size
- * - shredBAT_extend() low-level way of increasing the logical size of a
bat
- * - shredBAT_setcount() make more room in a BAT(using BATextend)
- * - shredBAT_append_str() append into a BAT[void,str], possibly with
compression(double-elim)
- *
====================================================================================
*/
-
-static void
-shredBAT_setcount(shredBAT *sb,
- oid n)
+...@- Shredder low-level routines that add data to bats during parsing
+ - shredBAT_newsize() compute a suitable new bat size
+ - shredBAT_extend() low-level way of increasing the logical size of a bat
+ - shredBAT_setcount() make more room in a BAT(using BATextend)
+ - shredBAT_append_str() append into a BAT[void,str], possibly with
+ compression(double-elim)
+
+...@c
+
+static void
+shredBAT_setcount(shredBAT *sb, oid n)
{
BAT *b = sb->bat;
BATsetcount(b, n);
@@ -423,11 +405,11 @@
}
-static BUN
+static BUN
shredBAT_newsize(shredBAT *sb, BUN newsize)
{
newsize = MAX(sb->size, newsize + MAX(BATTINY, newsize/4));
- if (sb->def->table == PRE_SIZE && (newsize & REMAP_PAGE_MASK)) {
+ if (sb->def->table == PRE_SIZE && (newsize & REMAP_PAGE_MASK)) {
/* align rid bats on the page-size */
newsize += REMAP_PAGE_SIZE - (newsize & REMAP_PAGE_MASK);
}
@@ -439,16 +421,16 @@
if (newsize > INT_MAX) {
newsize = INT_MAX;
if (sb->size == INT_MAX) {
- GDKerror("shredBAT_newsize: %s cannot be extended beyond %d
(INT_MAX)\n", sb->def->name, INT_MAX);
+ GDKerror("shredBAT_newsize: %s cannot be extended beyond %d "
+ "(INT_MAX)\n", sb->def->name, INT_MAX);
return 0;
}
}
return newsize;
}
-static int
-shredBAT_extend(shredBAT *sb,
- BUN newsize)
+static int
+shredBAT_extend(shredBAT *sb, BUN newsize)
{
shredBAT_setcount(sb, sb->size);
sb->size = newsize;
@@ -459,18 +441,16 @@
BATmmap(sb->bat, STORE_MMAP, STORE_MMAP, STORE_MMAP, STORE_MMAP);
}
if (!(sb->bat = BATextend(sb->bat, newsize))) {
- GDKerror("shredBAT_extend: BATextend[\"%s\"](%d to %d) fails\n",
sb->def->name, sb->size, newsize);
+ GDKerror("shredBAT_extend: BATextend[\"%s\"](%d to %d) fails\n",
+ sb->def->name, sb->size, newsize);
return GDK_FAIL;
}
sb->cast.voidCAST =(void*)Tloc(sb->bat, BUNfirst(sb->bat));
return GDK_SUCCEED;
}
-
-static INLINE oid
-shredBAT_append_str(shredCtxStruct *shredCtx,
- int idx,
- str v)
+static INLINE oid
+shredBAT_append_str(shredCtxStruct *shredCtx, int idx, str v)
{
shredBAT *sb = & shredCtx->dstBAT[idx];
BUN oldsize, newsize;
@@ -498,31 +478,27 @@
oldsize = sb->bat->T->heap.free & (REMAP_PAGE_MAXSIZE-1);
if (!BUNappend(sb->bat,(ptr)v, TRUE)) {
- GDKerror("shredBAT_append_str: APPEND-STR[%s](%s), BUNappend fails\n",
sb->def->name, v);
+ GDKerror("shredBAT_append_str: APPEND-STR[%s](%s), BUNappend fails\n",
+ sb->def->name, v);
return oid_nil;
}
newsize = sb->bat->T->heap.free & (REMAP_PAGE_MAXSIZE-1);
if (oldsize > newsize) { /* try to use mmap() */
- BATmmap(sb->bat, STORE_MMAP, STORE_MMAP, STORE_MMAP,
GDK_ELIMDOUBLES(sb->bat->theap)?STORE_MEM:STORE_MMAP);
+ BATmmap(sb->bat, STORE_MMAP, STORE_MMAP, STORE_MMAP,
+ GDK_ELIMDOUBLES(sb->bat->theap)?STORE_MEM:STORE_MMAP);
}
return BATcount(sb->bat)-1;
}
-/*
====================================================================================
- * shredder helper functions: emitting nodes
- * - emit_tuple() inserts a tuple in the RID bats
- * - emit_node() emit a node (calls emit_tuple)
- * - emit_string() emit a text node (calls emit_node)
- *
====================================================================================
*/
+...@- Shredder helper functions: emitting nodes
+ - emit_tuple() inserts a tuple in the RID bats
+ - emit_node() emit a node (calls emit_tuple)
+ - emit_string() emit a text node (calls emit_node)
+...@c
static INLINE int
-emit_tuple(shredCtxStruct* shredCtx,
- oid pre,
- int size,
- int level,
- oid prop,
- chr kind,
- oid nid)
+emit_tuple(shredCtxStruct* shredCtx, oid pre, int size, int level, oid prop,
+ chr kind, oid nid)
{
if (size >= GDK_int_max) {
GDKerror("emit_tuple: node.size >= GDK_int_max\n");
@@ -554,15 +530,13 @@
if (pre >= shredCtx->dstBAT[PRE_SIZE].size) {
BUN newsize = shredBAT_newsize(&shredCtx->dstBAT[PRE_SIZE], pre);
int err = 0;
- MT_sema_down(shredCtx->extend_sema, "shredder_extend");
- err =
+ err =
!(newsize &&
shredBAT_extend(&shredCtx->dstBAT[PRE_SIZE], newsize) &&
shredBAT_extend(&shredCtx->dstBAT[PRE_LEVEL], newsize) &&
shredBAT_extend(&shredCtx->dstBAT[PRE_PROP], newsize) &&
shredBAT_extend(&shredCtx->dstBAT[PRE_KIND], newsize) &&
(shredCtx->updatable == 0 ||
shredBAT_extend(&shredCtx->dstBAT[PRE_NID], newsize)));
- MT_sema_up(shredCtx->extend_sema, "shredder_extend");
if (err) return GDK_FAIL;
}
/* insert the new RID tuple */
@@ -1644,13 +1618,6 @@
shredCtx->serFun = serFun;
shredCtx->serCtx = serCtx;
shredCtx->coll_lock = coll_lock;
- {
- /* avoid "dereferencing type-punned pointer will break strict-aliasing
rules" with gcc >= 4.1 */
- /* (void) CMDpflock_get((ptr*) &(shredCtx->extend_sema), &i); */
- ptr s = (ptr)shredCtx->extend_sema;
- (void) CMDpflock_get(&s, &i);
- shredCtx->extend_sema = s;
- }
shredCtx->incremental = (BATcount(docBAT) > 0);
/* as we want to release the lock once in a while, but having the lock
alone does not guarantee
@@ -1971,14 +1938,13 @@
}
-/*
====================================================================================
- * the implemented MIL commands:
- * - shred() the routine that implements all
- * - CMDshred_url() MIL command that shreds from a url
- * - CMDshred_str() MIL command that shreds from a string
- * - CMDshred_stream() MIL command that shreds from a stream
- *
====================================================================================
*/
+...@- The implemented MAL commands
+
+- shred_url() MAL command that shreds from a url
+- shred_str() MAL command that shreds from a string
+- shred_stream() MAL command that shreds from a stream
+...@c
#define DEFAULT_ESTIMATE 1048576
/* function which tries to determine the base location of a document from
@@ -2076,38 +2042,72 @@
return res;
}
-int
-CMDshred_url(BAT *docBAT,
- str location,
- lng *percentage,
- lock *collLock,
- bit *verbose)
+str
+XQUERYShredUrl (int retval, bat *doc, str location, lng *percentage,
+ lock* collLock, bit *verbose)
{
+ int res;
+ BAT *docBAT;
+ (void) retval;
+
assert(percentage);
- return shred(docBAT, location, NULL, NULL, *percentage, NULL, NULL,
(*verbose==TRUE)?NULL:collLock);
+
+ if ((docBAT = BATdescriptor(*doc)) == NULL) {
+ throw(XQUERY, "xquery.shred_url", "wrong doc BAT");
+ }
+
+ res = shred(docBAT, location, NULL, NULL, *percentage, NULL, NULL,
+ (*verbose==TRUE)?NULL:collLock);
+
+ if (res == GDK_FAIL)
+ throw(XQUERY, "xquery.shred_url", "unable to shred the specified url");
+ return MAL_SUCCEED;
}
-int
-CMDshred_str(BAT *docBAT,
- str buffer,
- lng *percentage,
- lock *collLock,
- bit *verbose)
+str
+XQUERYShredStr (int retval, bat *doc, str buffer, lng *percentage,
+ lock* collLock, bit *verbose)
{
+ int res;
+ BAT *docBAT;
+ (void) retval;
+
assert(percentage);
- return shred(docBAT, NULL, buffer, NULL, *percentage, NULL, NULL,
(*verbose==TRUE)?NULL:collLock);
+
+ if ((docBAT = BATdescriptor(*doc)) == NULL) {
+ throw(XQUERY, "xquery.shred_str", "wrong doc BAT");
+ }
+
+ res = shred(docBAT, NULL, buffer, NULL, *percentage, NULL, NULL,
+ (*verbose==TRUE)?NULL:collLock);
+
+ if (res == GDK_FAIL)
+ throw(XQUERY, "xquery.shred_str",
+ "unable to shred the specified string");
+ return MAL_SUCCEED;
}
-int
-CMDshred_stream(BAT *docBAT,
- Stream *fp,
- lng *percentage,
- lock *collLock,
- bit *verbose)
+str
+XQUERYShredStream (int retval, bat *doc, stream *s, lng *percentage,
+ lock* collLock, bit *verbose)
{
- stream *s = (stream*) ((fp && *(ptr*) fp != ptr_nil)?*fp:NULL);
+ int res;
+ BAT *docBAT;
+ (void) retval;
+
assert(percentage);
- return shred(docBAT, NULL, NULL, s, *percentage, NULL, NULL,
(*verbose==TRUE)?NULL:collLock);
+
+ if ((docBAT = BATdescriptor(*doc)) == NULL) {
+ throw(XQUERY, "xquery.shred_stream", "wrong doc BAT");
+ }
+
+ res = shred(docBAT, NULL, NULL, s, *percentage, NULL, NULL,
+ (*verbose==TRUE)?NULL:collLock);
+
+ if (res == GDK_FAIL)
+ throw(XQUERY, "xquery.shred_stream",
+ "unable to shred the specified stream");
+ return MAL_SUCCEED;
}
-...@c
+
/* vim:set shiftwidth=4 expandtab: */
------------------------------------------------------------------------------
This SF.net email is sponsored by:
High Quality Requirements in a Collaborative Environment.
Download a free trial of Rational Requirements Composer Now!
http://p.sf.net/sfu/www-ibm-com
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins