Nicolae Brinza has proposed merging lp:~nbrinza/zorba/parse-fragment into lp:zorba.

Requested reviews:
  Nicolae Brinza (nbrinza)
  Chris Hillery (ceejatec)

Related bugs:
  Bug #1016606 in Zorba: "xml:parse unable to parse content containing a DOCTYPE"
  https://bugs.launchpad.net/zorba/+bug/1016606
  Bug #1023170 in Zorba: "Segfault in xml:parse"
  https://bugs.launchpad.net/zorba/+bug/1023170
  Bug #1024033 in Zorba: "segfault in parse-xml:parse()"
  https://bugs.launchpad.net/zorba/+bug/1024033
  Bug #1027270 in Zorba: "xml:parse() - infinite loop"
  https://bugs.launchpad.net/zorba/+bug/1027270
  Bug #1088886 in Zorba: "fn:parse-xml-fragment broken on 64 bit"
  https://bugs.launchpad.net/zorba/+bug/1088886
  Bug #1099535 in Zorba: "xml:parse endless loop"
  https://bugs.launchpad.net/zorba/+bug/1099535
  Bug #1099648 in Zorba: "XML parsing failures on Red Hat"
  https://bugs.launchpad.net/zorba/+bug/1099648

For more details, see:
https://code.launchpad.net/~nbrinza/zorba/parse-fragment/+merge/144007

Fix for bugs #1099535, #1099648 and #1088886
--
https://code.launchpad.net/~nbrinza/zorba/parse-fragment/+merge/144007
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'ChangeLog' --- ChangeLog 2013-01-14 09:46:44 +0000 +++ ChangeLog 2013-01-20 00:28:21 +0000 @@ -1,6 +1,14 @@ Zorba - The XQuery Processor +version 2.9 + +Bug Fixes/Other Changes: + * Fixed bug #1099648 and #1088886 (XML parsing failures on Red Hat) + * Fixed bug #1099535 (xml:parse endless loop) + + + version 2.8 New Features: === modified file 'src/store/naive/loader.h' --- src/store/naive/loader.h 2012-09-19 21:16:15 +0000 +++ src/store/naive/loader.h 2013-01-20 00:28:21 +0000 @@ -261,7 +261,8 @@ const FragmentIStream* getFragmentStream() const { return theFragmentStream; }; protected: - bool fillBuffer(FragmentIStream* theFragmentStream); + // returns true if the input buffer is not yet fully consumed + bool fillBuffer(FragmentIStream* theFragmentStream); unsigned long getCurrentInputOffset() const; === modified file 'src/store/naive/loader_dtd.cpp' --- src/store/naive/loader_dtd.cpp 2012-09-19 21:16:15 +0000 +++ src/store/naive/loader_dtd.cpp 2013-01-20 00:28:21 +0000 @@ -149,6 +149,7 @@ { } +// returns true if the input buffer is not yet fully consumed bool FragmentXmlLoader::fillBuffer(FragmentIStream* theFragmentStream) { if (theFragmentStream->ctxt->input->length > 0 && theFragmentStream->current_offset < theFragmentStream->bytes_in_buffer) @@ -175,6 +176,7 @@ theFragmentStream->ctxt->input->length = (theFragmentStream->bytes_in_buffer < (theFragmentStream->theBuffer.size()-1) ? theFragmentStream->bytes_in_buffer : (theFragmentStream->theBuffer.size()-1)); theFragmentStream->ctxt->input->cur = theFragmentStream->ctxt->input->base; theFragmentStream->ctxt->input->end = theFragmentStream->ctxt->input->base + theFragmentStream->ctxt->input->length; + theFragmentStream->ctxt->checkIndex = 0; // this needs to be reset to force LibXml2 to rescan the buffer. 
Otherwise it might fail to detect opening/closing tags in certain inputs if (theFragmentStream->bytes_in_buffer < theFragmentStream->theBuffer.size()-1) theFragmentStream->theBuffer[theFragmentStream->bytes_in_buffer] = 0; @@ -241,7 +243,9 @@ // Initialize the parser input (only filename and the pointer to the current char) theFragmentStream->theBuffer[0] = ' '; // This assignment is needed for LibXml2-2.7.6, which tries to read the buffer when xmlPushInput() is called - input->cur = (xmlChar*)(&theFragmentStream->theBuffer[0]); + input->base = (xmlChar*)(&theFragmentStream->theBuffer[0]); + input->cur = input->base; + // input->cur = (xmlChar*)(&theFragmentStream->theBuffer[0]); input->filename = (const char*)(xmlCanonicPath((const xmlChar*)theDocUri.c_str())); xmlPushInput(theFragmentStream->ctxt, input); } @@ -250,6 +254,8 @@ theFragmentStream->ctxt->disableSAX = false; // xmlStopParser() sets disableSAX to true theFragmentStream->parsed_nodes_count = 0; theFragmentStream->forced_parser_stop = false; + + // theFragmentStream->ctxt->progressive = 1; if (theFragmentStream->state != FragmentIStream::FRAGMENT_FIRST_START_DOC) { @@ -257,7 +263,8 @@ FragmentXmlLoader::startDocument(theFragmentStream->ctxt->userData); } - while ( ! theFragmentStream->forced_parser_stop && fillBuffer(theFragmentStream)) + bool buffer_not_consumed; + while ( ! 
theFragmentStream->forced_parser_stop && (buffer_not_consumed = fillBuffer(theFragmentStream))) { if (theFragmentStream->only_one_doc_node && theFragmentStream->state != FragmentIStream::FRAGMENT_FIRST_START_DOC) { @@ -316,9 +323,17 @@ } /* + std::string buffer = (char*)theFragmentStream->ctxt->input->cur; + if (theFragmentStream->ctxt->input->length < buffer.size()) + buffer = buffer.substr(0, theFragmentStream->ctxt->input->length); std::cerr << "\n==================\n--> skip_root: " << theFragmentStream->root_elements_to_skip << " current_depth: " << theFragmentStream->current_element_depth - << " state: " << theFragmentStream->ctxt->instate - << " about to parse: [" << theFragmentStream->ctxt->input->cur << "] " << std::endl; + << " state: " << theFragmentStream->ctxt->instate + << " about to parse: ["; + if (buffer.size() > 500) + std::cerr << buffer.substr(0, 160) << "\n...\n" << buffer.substr(buffer.size()-160); + else + std::cerr << theFragmentStream->ctxt->input->cur; + std::cerr << "] " << std::endl; */ xmlParseChunk(theFragmentStream->ctxt, (const char*)theFragmentStream->ctxt->input->cur, @@ -332,8 +347,10 @@ xmlParseCharData(theFragmentStream->ctxt, 0); theFragmentStream->current_offset = getCurrentInputOffset(); // update current offset - if (theXQueryDiagnostics->errors().empty() && theFragmentStream->current_offset == 0 && theFragmentStream->ctxt->checkIndex > 0) + if (theXQueryDiagnostics->errors().empty() && theFragmentStream->current_offset == 0) { + assert(buffer_not_consumed == true); + // we still haven't moved, double the buffer size theFragmentStream->theBuffer.resize((theFragmentStream->theBuffer.size()-1) * 2 + 1); theFragmentStream->ctxt->input->base = (xmlChar*)(&theFragmentStream->theBuffer[0]); === added file 'test/rbkt/ExpQueryResults/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xml.res' --- test/rbkt/ExpQueryResults/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xml.res 1970-01-01 00:00:00 +0000 +++ 
test/rbkt/ExpQueryResults/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xml.res 2013-01-20 00:28:21 +0000 @@ -0,0 +1,159 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<record xmlns="http://www.loc.gov/MARC21/slim"> + <leader>01428cam a2200313 a 4500</leader> + <controlfield tag="001">2</controlfield> + <controlfield tag="005">20060629073726.0</controlfield> + <controlfield tag="007">cr |||||||||||</controlfield> + <controlfield tag="008">990629s1900 maua 000 1 eng </controlfield> + <datafield tag="906" ind1=" " ind2=" "> + <subfield code="a">7</subfield> + <subfield code="b">ibc</subfield> + <subfield code="c">orignew</subfield> + <subfield code="d">u</subfield> + <subfield code="e">ocip</subfield> + <subfield code="f">19</subfield> + <subfield code="g">y-gencatlg</subfield> + </datafield> + <datafield tag="955" ind1=" " ind2=" "> + <subfield code="a">NEW INPUT vj36 06-29-99</subfield> + <subfield code="e">vj05 2002-04-03</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="9">(DLC) 00002266</subfield> + </datafield> + <datafield tag="010" ind1=" " ind2=" "> + <subfield code="a"> 00002266 </subfield> + </datafield> + <datafield tag="040" ind1=" " ind2=" "> + <subfield code="a">DLC</subfield> + <subfield code="c">DLC</subfield> + <subfield code="d">DLC</subfield> + </datafield> + <datafield tag="050" ind1="0" ind2="0"> + <subfield code="a">PZ3.L846</subfield> + <subfield code="b">So</subfield> + </datafield> + <datafield tag="051" ind1=" " ind2=" "> + <subfield code="a">PS3523.O46</subfield> + <subfield code="b">S72 1900</subfield> + </datafield> + <datafield tag="100" ind1="1" ind2=" "> + <subfield code="a">London, Jack,</subfield> + <subfield code="d">1876-1916.</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="4"> + <subfield code="a">The son of the wolf :</subfield> + <subfield code="b">tales of the far North /</subfield> + <subfield code="c">by Jack London.</subfield> + </datafield> + <datafield 
tag="246" ind1="3" ind2="0"> + <subfield code="a">Tales of the far North</subfield> + </datafield> + <datafield tag="260" ind1=" " ind2=" "> + <subfield code="a">Boston :</subfield> + <subfield code="b">Houghton, Mifflin,</subfield> + <subfield code="c">1900.</subfield> + </datafield> + <datafield tag="300" ind1=" " ind2=" "> + <subfield code="a">251 p. :</subfield> + <subfield code="b">1 ill. ;</subfield> + <subfield code="c">20 cm.</subfield> + </datafield> + <datafield tag="500" ind1=" " ind2=" "> + <subfield code="a">These tales appeared previously in the Overland monthly, 1899, and the Atlantic monthly.</subfield> + </datafield> + <datafield tag="505" ind1="0" ind2=" "> + <subfield code="a">The white silence -- The son of the wolf -- The men of Forty-Mile -- In a far country -- To the man on trail -- The priestly prerogative -- The wisdom of the trail -- The wife of a king -- An odyssey of the North.</subfield> + </datafield> + <datafield tag="530" ind1=" " ind2=" "> + <subfield code="a">Also available in digital form on the Library of Congress Web site.</subfield> + </datafield> + <datafield tag="856" ind1="4" ind2="1"> + <subfield code="d">mtfrb</subfield> + <subfield code="f">02266</subfield> + <subfield code="q">h</subfield> + <subfield code="u">http://hdl.loc.gov/loc.rbc/mtfrb.02266</subfield> + </datafield> + <datafield tag="859" ind1="4" ind2="2"> + <subfield code="3">Meeting of Frontiers: Siberia, Alaska, and the American West - "Rare Book Collections" Collection Description</subfield> + <subfield code="u">http://hdl.loc.gov/loc.eur/mtfhtml.0054</subfield> + </datafield> + <datafield tag="984" ind1=" " ind2=" "> + <subfield code="a">gsl</subfield> + </datafield> + <datafield tag="985" ind1=" " ind2=" "> + <subfield code="a">mtfront/mtfrb/tx</subfield> + <subfield code="e">intldl</subfield> + </datafield> + <datafield tag="985" ind1=" " ind2=" "> + <subfield code="a">pmpull</subfield> + <subfield code="e">intldl</subfield> + </datafield> +</record> 
+<record xmlns="http://www.loc.gov/MARC21/slim"> + <leader>00757cam a22002171 4500</leader> + <controlfield tag="001">3</controlfield> + <controlfield tag="005">20080606094929.0</controlfield> + <controlfield tag="008">980227s1900 iluc 000 0 eng </controlfield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="9">(DLC) 00004790</subfield> + </datafield> + <datafield tag="906" ind1=" " ind2=" "> + <subfield code="a">0</subfield> + <subfield code="b">ibc</subfield> + <subfield code="c">orignew</subfield> + <subfield code="d">3</subfield> + <subfield code="e">ocip</subfield> + <subfield code="f">19</subfield> + <subfield code="g">y-gencatlg</subfield> + </datafield> + <datafield tag="955" ind1=" " ind2=" "> + <subfield code="a">jd99 02-27-98</subfield> + </datafield> + <datafield tag="010" ind1=" " ind2=" "> + <subfield code="a"> 00004790 </subfield> + </datafield> + <datafield tag="040" ind1=" " ind2=" "> + <subfield code="a">DLC</subfield> + <subfield code="c">DLC</subfield> + </datafield> + <datafield tag="050" ind1="0" ind2="0"> + <subfield code="a">PG3385</subfield> + <subfield code="b">.S85</subfield> + </datafield> + <datafield tag="100" ind1="1" ind2=" "> + <subfield code="a">Stockham, Alice B.</subfield> + <subfield code="q">(Alice Bunker),</subfield> + <subfield code="d">1833-1912.</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="0"> + <subfield code="a">Tolstoi, a man of peace,</subfield> + <subfield code="c">by Alice B. Stockham. The new spirit by H. Havelock Ellis.</subfield> + </datafield> + <datafield tag="260" ind1=" " ind2=" "> + <subfield code="a">Chicago,</subfield> + <subfield code="b">A.B. Stockham,</subfield> + <subfield code="c">[1900]</subfield> + </datafield> + <datafield tag="300" ind1=" " ind2=" "> + <subfield code="a">140 p.</subfield> + <subfield code="b">ports.</subfield> + <subfield code="c">18 cm.</subfield> + </datafield> + <datafield tag="500" ind1=" " ind2=" "> + <subfield code="a">"The new spirit" (p. 
[85]-140) has special t.-p.</subfield> + </datafield> + <datafield tag="600" ind1="1" ind2="0"> + <subfield code="a">Tolstoy, Leo,</subfield> + <subfield code="c">graf,</subfield> + <subfield code="d">1828-1910.</subfield> + </datafield> + <datafield tag="700" ind1="1" ind2="2"> + <subfield code="a">Ellis, Havelock,</subfield> + <subfield code="d">1859-1939.</subfield> + <subfield code="t">New spirit.</subfield> + <subfield code="f">1900.</subfield> + </datafield> +</record> + === added file 'test/rbkt/Queries/zorba/parsing_and_serializing/bad.xml' --- test/rbkt/Queries/zorba/parsing_and_serializing/bad.xml 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/parsing_and_serializing/bad.xml 2013-01-20 00:28:21 +0000 @@ -0,0 +1,158 @@ +<collection xmlns="http://www.loc.gov/MARC21/slim"> +<record> + <leader>01428cam a2200313 a 4500</leader> + <controlfield tag="001">2</controlfield> + <controlfield tag="005">20060629073726.0</controlfield> + <controlfield tag="007">cr |||||||||||</controlfield> + <controlfield tag="008">990629s1900 maua 000 1 eng </controlfield> + <datafield tag="906" ind1=" " ind2=" "> + <subfield code="a">7</subfield> + <subfield code="b">ibc</subfield> + <subfield code="c">orignew</subfield> + <subfield code="d">u</subfield> + <subfield code="e">ocip</subfield> + <subfield code="f">19</subfield> + <subfield code="g">y-gencatlg</subfield> + </datafield> + <datafield tag="955" ind1=" " ind2=" "> + <subfield code="a">NEW INPUT vj36 06-29-99</subfield> + <subfield code="e">vj05 2002-04-03</subfield> + </datafield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="9">(DLC) 00002266</subfield> + </datafield> + <datafield tag="010" ind1=" " ind2=" "> + <subfield code="a"> 00002266 </subfield> + </datafield> + <datafield tag="040" ind1=" " ind2=" "> + <subfield code="a">DLC</subfield> + <subfield code="c">DLC</subfield> + <subfield code="d">DLC</subfield> + </datafield> + <datafield tag="050" ind1="0" ind2="0"> + <subfield 
code="a">PZ3.L846</subfield> + <subfield code="b">So</subfield> + </datafield> + <datafield tag="051" ind1=" " ind2=" "> + <subfield code="a">PS3523.O46</subfield> + <subfield code="b">S72 1900</subfield> + </datafield> + <datafield tag="100" ind1="1" ind2=" "> + <subfield code="a">London, Jack,</subfield> + <subfield code="d">1876-1916.</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="4"> + <subfield code="a">The son of the wolf :</subfield> + <subfield code="b">tales of the far North /</subfield> + <subfield code="c">by Jack London.</subfield> + </datafield> + <datafield tag="246" ind1="3" ind2="0"> + <subfield code="a">Tales of the far North</subfield> + </datafield> + <datafield tag="260" ind1=" " ind2=" "> + <subfield code="a">Boston :</subfield> + <subfield code="b">Houghton, Mifflin,</subfield> + <subfield code="c">1900.</subfield> + </datafield> + <datafield tag="300" ind1=" " ind2=" "> + <subfield code="a">251 p. :</subfield> + <subfield code="b">1 ill. ;</subfield> + <subfield code="c">20 cm.</subfield> + </datafield> + <datafield tag="500" ind1=" " ind2=" "> + <subfield code="a">These tales appeared previously in the Overland monthly, 1899, and the Atlantic monthly.</subfield> + </datafield> + <datafield tag="505" ind1="0" ind2=" "> + <subfield code="a">The white silence -- The son of the wolf -- The men of Forty-Mile -- In a far country -- To the man on trail -- The priestly prerogative -- The wisdom of the trail -- The wife of a king -- An odyssey of the North.</subfield> + </datafield> + <datafield tag="530" ind1=" " ind2=" "> + <subfield code="a">Also available in digital form on the Library of Congress Web site.</subfield> + </datafield> + <datafield tag="856" ind1="4" ind2="1"> + <subfield code="d">mtfrb</subfield> + <subfield code="f">02266</subfield> + <subfield code="q">h</subfield> + <subfield code="u">http://hdl.loc.gov/loc.rbc/mtfrb.02266</subfield> + </datafield> + <datafield tag="859" ind1="4" ind2="2"> + <subfield 
code="3">Meeting of Frontiers: Siberia, Alaska, and the American West - "Rare Book Collections" Collection Description</subfield> + <subfield code="u">http://hdl.loc.gov/loc.eur/mtfhtml.0054</subfield> + </datafield> + <datafield tag="984" ind1=" " ind2=" "> + <subfield code="a">gsl</subfield> + </datafield> + <datafield tag="985" ind1=" " ind2=" "> + <subfield code="a">mtfront/mtfrb/tx</subfield> + <subfield code="e">intldl</subfield> + </datafield> + <datafield tag="985" ind1=" " ind2=" "> + <subfield code="a">pmpull</subfield> + <subfield code="e">intldl</subfield> + </datafield> +</record> +<record> + <leader>00757cam a22002171 4500</leader> + <controlfield tag="001">3</controlfield> + <controlfield tag="005">20080606094929.0</controlfield> + <controlfield tag="008">980227s1900 iluc 000 0 eng </controlfield> + <datafield tag="035" ind1=" " ind2=" "> + <subfield code="9">(DLC) 00004790</subfield> + </datafield> + <datafield tag="906" ind1=" " ind2=" "> + <subfield code="a">0</subfield> + <subfield code="b">ibc</subfield> + <subfield code="c">orignew</subfield> + <subfield code="d">3</subfield> + <subfield code="e">ocip</subfield> + <subfield code="f">19</subfield> + <subfield code="g">y-gencatlg</subfield> + </datafield> + <datafield tag="955" ind1=" " ind2=" "> + <subfield code="a">jd99 02-27-98</subfield> + </datafield> + <datafield tag="010" ind1=" " ind2=" "> + <subfield code="a"> 00004790 </subfield> + </datafield> + <datafield tag="040" ind1=" " ind2=" "> + <subfield code="a">DLC</subfield> + <subfield code="c">DLC</subfield> + </datafield> + <datafield tag="050" ind1="0" ind2="0"> + <subfield code="a">PG3385</subfield> + <subfield code="b">.S85</subfield> + </datafield> + <datafield tag="100" ind1="1" ind2=" "> + <subfield code="a">Stockham, Alice B.</subfield> + <subfield code="q">(Alice Bunker),</subfield> + <subfield code="d">1833-1912.</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="0"> + <subfield code="a">Tolstoi, a man of 
peace,</subfield> + <subfield code="c">by Alice B. Stockham. The new spirit by H. Havelock Ellis.</subfield> + </datafield> + <datafield tag="260" ind1=" " ind2=" "> + <subfield code="a">Chicago,</subfield> + <subfield code="b">A.B. Stockham,</subfield> + <subfield code="c">[1900]</subfield> + </datafield> + <datafield tag="300" ind1=" " ind2=" "> + <subfield code="a">140 p.</subfield> + <subfield code="b">ports.</subfield> + <subfield code="c">18 cm.</subfield> + </datafield> + <datafield tag="500" ind1=" " ind2=" "> + <subfield code="a">"The new spirit" (p. [85]-140) has special t.-p.</subfield> + </datafield> + <datafield tag="600" ind1="1" ind2="0"> + <subfield code="a">Tolstoy, Leo,</subfield> + <subfield code="c">graf,</subfield> + <subfield code="d">1828-1910.</subfield> + </datafield> + <datafield tag="700" ind1="1" ind2="2"> + <subfield code="a">Ellis, Havelock,</subfield> + <subfield code="d">1859-1939.</subfield> + <subfield code="t">New spirit.</subfield> + <subfield code="f">1900.</subfield> + </datafield> +</record> +</collection> === added file 'test/rbkt/Queries/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xq' --- test/rbkt/Queries/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xq 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xq 2013-01-20 00:28:21 +0000 @@ -0,0 +1,13 @@ +import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml"; +import schema namespace opt = "http://www.zorba-xquery.com/modules/xml-options"; +import module namespace fetch = "http://www.zorba-xquery.com/modules/fetch"; + +variable $xmlcontents := fetch:content(resolve-uri("bad.xml")); + +let $contents := parse-xml:parse( + $xmlcontents, + <opt:options> + <opt:parse-external-parsed-entity opt:skip-root-nodes="1"/> + </opt:options>) + +return $contents
--
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp