Hello all, A few weeks back I was trying to use libxml2's htmlReadFd() function to parse HTML from a file descriptor. However, it only parsed the top-level tags - any child tags were null. This is because htmlReadFd() mixes the HTML and XML parsing functions as if they were interchangeable (for example, it creates the parser context with xmlNewParserCtxt() instead of htmlNewParserCtxt()). I have fixed this in the patch below. I have also submitted a merge request to fix this issue here (https://gitlab.gnome.org/GNOME/libxml2/-/merge_requests/129).
diff --git a/HTMLparser.c b/HTMLparser.c index b56363a3..bf8268e5 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -6999,7 +6999,9 @@ htmlReadMemory(const char *buffer, int size, const char *URL, const char *encodi * @encoding: the document encoding, or NULL * @options: a combination of htmlParserOption(s) * - * parse an XML from a file descriptor and build a tree. + * parse an HTML from a file descriptor and build a tree. + * NOTE that the file descriptor will not be closed when the + * reader is closed or reset. * * Returns the resulting document tree */ @@ -7008,17 +7010,17 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options) { htmlParserCtxtPtr ctxt; xmlParserInputBufferPtr input; - xmlParserInputPtr stream; + htmlParserInputPtr stream; if (fd < 0) return (NULL); - xmlInitParser(); xmlInitParser(); input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); if (input == NULL) return (NULL); - ctxt = xmlNewParserCtxt(); + input->closecallback = NULL; + ctxt = htmlNewParserCtxt(); if (ctxt == NULL) { xmlFreeParserInputBuffer(input); return (NULL); @@ -7026,7 +7028,7 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options) stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); if (stream == NULL) { xmlFreeParserInputBuffer(input); - xmlFreeParserCtxt(ctxt); + htmlFreeParserCtxt(ctxt); return (NULL); } inputPush(ctxt, stream); _______________________________________________ xml mailing list, project page http://xmlsoft.org/ xml@gnome.org https://mail.gnome.org/mailman/listinfo/xml