Daniel Veillard wrote:
> On Fri, Jan 25, 2008 at 02:39:22PM +0100, Julien Charbon wrote:
>> Thus:
>> "&myent;" -> (1) -> "&myent;" -> (2) -> "&myent;"
>
> argh, right .... I'm afraid the escaping has been added as an afterthought
> it was not supposed to be that way, oh well, one can still build the
> complex attrubute values 'by hand' with the help of the API, but I think
> somehow we defeated the initial purpose for the xmlStringGetNodeList() call
>
>> It's give to me with current libxml2 trunk:
>> $ gcc test-xml-tiny.c -o test-xml-tiny $(xml2-config --cflags) \
>> $(xml2-config --libs)
>> $ ./test-xml-tiny
>> Only one element in return of xmlStringGetNodeList
>> &foo; &bar; & <tag> &myent </tag> &&
>>
>> No change. Maybe, historically, it was not always the case...
>
> Hum, yes. The only other thing that your suggested change would loose
> are the error message resulting from the validations occuring in
> xmlEncodeEntitiesReentrant() , problems reported there would go unnoticed
> otherwise. Is that still worth the extra complexity or not, I'm not sure.
Hum, sure, UTF-8 validation shall not be removed. Anyway to evaluate
this extra complexity, I made the simple program [see below] that do
1000 iterations of xmlSetProp(node, name, value) and calculate the sum
of all these calls with various 'value' parameter:
- With current libxml2 trunk , on my laptop:
[Time is given in second:nanosecond format]
$ ./test-setprop "value"
Time: 0:000980992
$ ./test-setprop "&myent;"
Time: 0:001219099
$ ./test-setprop "&コンニチハ"
Time: 0:004148401
- With patch below [not a final patch, just a patch that verify the
UTF-8 validity of value, and directly add the XML_TEXT_NODE]
$ ./test-setprop "value"
Time: 0:000794779
$ ./test-setprop "&myent;"
Time: 0:000793037
$ ./test-setprop "&コンニチハ"
Time: 0:000819009
Thus on my machine it is always faster, and depending on complexity
of 'value' passed to xmlSetProp(), improvement vary between 20% and
80%. Because measured times are very short[< 1ms], their standard
deviation are high, but these numbers give an order of magnitude of
removed extra complexity.
test-setprop.c is compiled like:
$ gcc -Wall test-setprop.c -o test-setprop \
$(xml2-config --cflags) $(xml2-config --libs) -lrt
== test-setprop.c ==
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <libxml/tree.h>
#define NAME "name"
#define ITER 1000
static void usage(const char *progname) {
printf("Usage : %s <attribut-value>\n", progname);
}
struct timespec diff(struct timespec start, struct timespec end)
{
struct timespec time;
if ((end.tv_nsec-start.tv_nsec)<0) {
time.tv_sec = end.tv_sec-start.tv_sec-1;
time.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec;
} else {
time.tv_sec = end.tv_sec-start.tv_sec;
time.tv_nsec = end.tv_nsec-start.tv_nsec;
}
return time;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
usage(argv[0]);
return(1);
}
LIBXML_TEST_VERSION;
xmlDocPtr doc = NULL;
xmlNodePtr root_node = NULL;
struct timespec start, end, time;
doc = xmlNewDoc(BAD_CAST "1.0");
root_node = xmlNewNode(NULL, BAD_CAST "root");
xmlDocSetRootElement(doc, root_node);
unsigned long long result_sec = 0;
unsigned long long result_nsec = 0;
unsigned int i;
for(i = 0; i < ITER; i++) {
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
xmlSetProp(root_node, (unsigned char*)NAME, (unsigned char*)argv[1]);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
time = diff(start,end);
result_sec += time.tv_sec;
result_nsec += time.tv_nsec;
}
printf("Time:\t%lld:%09lld\n", result_sec, result_nsec);
xmlFreeDoc(doc);
return 0;
}
== test-setprop.c ==
Patch applied to current libxml2 trunk:
== tree.c.patch ==
Index: include/libxml/xmlerror.h
===================================================================
--- include/libxml/xmlerror.h (revision 3681)
+++ include/libxml/xmlerror.h (working copy)
@@ -398,6 +398,7 @@
XML_TREE_INVALID_HEX = 1300,
XML_TREE_INVALID_DEC, /* 1301 */
XML_TREE_UNTERMINATED_ENTITY, /* 1302 */
+ XML_TREE_NOT_UTF8, /* 1303 */
XML_SAVE_NOT_UTF8 = 1400,
XML_SAVE_CHAR_INVALID, /* 1401 */
XML_SAVE_NO_DOCTYPE, /* 1402 */
Index: tree.c
===================================================================
--- tree.c (revision 3681)
+++ tree.c (working copy)
@@ -92,6 +92,9 @@
case XML_TREE_UNTERMINATED_ENTITY:
msg = "unterminated entity reference %15s\n";
break;
+ case XML_TREE_NOT_UTF8:
+ msg = "string is not in UTF-8\n";
+ break;
default:
msg = "unexpected error number\n";
}
@@ -1814,11 +1817,13 @@
cur->name = name;
if (value != NULL) {
- xmlChar *buffer;
xmlNodePtr tmp;
- buffer = xmlEncodeEntitiesReentrant(doc, value);
- cur->children = xmlStringGetNodeList(doc, buffer);
+ if(!xmlCheckUTF8(value)) {
+ xmlTreeErr(XML_TREE_NOT_UTF8, (xmlNodePtr) doc,
+ NULL);
+ }
+ cur->children = xmlNewDocText(doc, value);
cur->last = NULL;
tmp = cur->children;
while (tmp != NULL) {
@@ -1827,7 +1832,6 @@
cur->last = tmp;
tmp = tmp->next;
}
- xmlFree(buffer);
}
/*
@@ -6466,11 +6470,13 @@
prop->last = NULL;
prop->ns = ns;
if (value != NULL) {
- xmlChar *buffer;
xmlNodePtr tmp;
- buffer = xmlEncodeEntitiesReentrant(node->doc, value);
- prop->children = xmlStringGetNodeList(node->doc, buffer);
+ if(!xmlCheckUTF8(value)) {
+ xmlTreeErr(XML_TREE_NOT_UTF8, (xmlNodePtr) node->doc,
+ NULL);
+ }
+ prop->children = xmlNewDocText(node->doc, value);
prop->last = NULL;
tmp = prop->children;
while (tmp != NULL) {
@@ -6479,7 +6485,6 @@
prop->last = tmp;
tmp = tmp->next;
}
- xmlFree(buffer);
}
if (prop->atype == XML_ATTRIBUTE_ID)
xmlAddID(NULL, node->doc, value, prop);
== tree.c.patch ==
--
Julien
_______________________________________________
xml mailing list, project page http://xmlsoft.org/
[email protected]
http://mail.gnome.org/mailman/listinfo/xml