Nicolae Brinza has proposed merging lp:~nbrinza/zorba/parse-fragment into 
lp:zorba.

Commit message:
Fix for bug #1099535 endless loop in xml:parse()

Requested reviews:
  Chris Hillery (ceejatec)
Related bugs:
  Bug #1016606 in Zorba: "xml:parse unable to parse content containing a 
DOCTYPE"
  https://bugs.launchpad.net/zorba/+bug/1016606
  Bug #1023170 in Zorba: "Segfault in xml:parse"
  https://bugs.launchpad.net/zorba/+bug/1023170
  Bug #1024033 in Zorba: "segfault in parse-xml:parse()"
  https://bugs.launchpad.net/zorba/+bug/1024033
  Bug #1027270 in Zorba: "xml:parse() - infinite loop"
  https://bugs.launchpad.net/zorba/+bug/1027270
  Bug #1099535 in Zorba: "xml:parse endless loop"
  https://bugs.launchpad.net/zorba/+bug/1099535

For more details, see:
https://code.launchpad.net/~nbrinza/zorba/parse-fragment/+merge/143519

Fix for bug #1099535 endless loop in xml:parse()
-- 
https://code.launchpad.net/~nbrinza/zorba/parse-fragment/+merge/143519
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'ChangeLog'
--- ChangeLog	2013-01-14 09:46:44 +0000
+++ ChangeLog	2013-01-16 14:40:27 +0000
@@ -1,6 +1,13 @@
 Zorba - The XQuery Processor
 
 
+version 2.9
+
+Bug Fixes/Other Changes:
+  * Fixed bug #1099535  (xml:parse endless loop)
+
+
+
 version 2.8
 
 New Features:

=== modified file 'src/store/naive/loader.h'
--- src/store/naive/loader.h	2012-09-19 21:16:15 +0000
+++ src/store/naive/loader.h	2013-01-16 14:40:27 +0000
@@ -261,7 +261,8 @@
   const FragmentIStream* getFragmentStream() const { return theFragmentStream; };
   
 protected:
-  bool fillBuffer(FragmentIStream* theFragmentStream);
+  // returns true if the input buffer is not yet fully consumed
+  bool fillBuffer(FragmentIStream* theFragmentStream);  
 
   unsigned long getCurrentInputOffset() const;
 

=== modified file 'src/store/naive/loader_dtd.cpp'
--- src/store/naive/loader_dtd.cpp	2012-09-19 21:16:15 +0000
+++ src/store/naive/loader_dtd.cpp	2013-01-16 14:40:27 +0000
@@ -149,6 +149,7 @@
 {
 }
 
+// returns true if the input buffer is not yet fully consumed
 bool FragmentXmlLoader::fillBuffer(FragmentIStream* theFragmentStream)
 {
   if (theFragmentStream->ctxt->input->length > 0 && theFragmentStream->current_offset < theFragmentStream->bytes_in_buffer)
@@ -175,6 +176,7 @@
   theFragmentStream->ctxt->input->length = (theFragmentStream->bytes_in_buffer < (theFragmentStream->theBuffer.size()-1) ? theFragmentStream->bytes_in_buffer : (theFragmentStream->theBuffer.size()-1));
   theFragmentStream->ctxt->input->cur = theFragmentStream->ctxt->input->base;
   theFragmentStream->ctxt->input->end = theFragmentStream->ctxt->input->base + theFragmentStream->ctxt->input->length;
+  theFragmentStream->ctxt->checkIndex = 0; // this needs to be reset to force LibXml2 to rescan the buffer. Otherwise it might fail to detect opening/closing tags in certain inputs
   
   if (theFragmentStream->bytes_in_buffer < theFragmentStream->theBuffer.size()-1)
     theFragmentStream->theBuffer[theFragmentStream->bytes_in_buffer] = 0;
@@ -250,6 +252,8 @@
     theFragmentStream->ctxt->disableSAX = false; // xmlStopParser() sets disableSAX to true
     theFragmentStream->parsed_nodes_count = 0;
     theFragmentStream->forced_parser_stop = false;
+    
+    // theFragmentStream->ctxt->progressive = 1;
 
     if (theFragmentStream->state != FragmentIStream::FRAGMENT_FIRST_START_DOC)
     {
@@ -257,7 +261,8 @@
       FragmentXmlLoader::startDocument(theFragmentStream->ctxt->userData);
     }
 
-    while ( ! theFragmentStream->forced_parser_stop && fillBuffer(theFragmentStream))
+    bool buffer_not_consumed;
+    while ( ! theFragmentStream->forced_parser_stop && (buffer_not_consumed = fillBuffer(theFragmentStream)))
     {
       if (theFragmentStream->only_one_doc_node && theFragmentStream->state != FragmentIStream::FRAGMENT_FIRST_START_DOC)
       {
@@ -316,9 +321,15 @@
       }
       
       /*
+      std::string buffer = (char*)theFragmentStream->ctxt->input->cur;
       std::cerr << "\n==================\n--> skip_root: " << theFragmentStream->root_elements_to_skip << " current_depth: " << theFragmentStream->current_element_depth 
-          << " state: " << theFragmentStream->ctxt->instate 
-          << " about to parse: [" << theFragmentStream->ctxt->input->cur << "] " << std::endl;
+          << " state: " << theFragmentStream->ctxt->instate
+          << " about to parse: [";
+      if (buffer.size() > 500)
+        std::cerr << buffer.substr(0, 160) << "\n...\n" << buffer.substr(buffer.size()-160);
+      else
+        std::cerr << theFragmentStream->ctxt->input->cur;
+      std::cerr << "] " << std::endl;
       */
       
       xmlParseChunk(theFragmentStream->ctxt, (const char*)theFragmentStream->ctxt->input->cur,
@@ -334,6 +345,8 @@
         
         if (theXQueryDiagnostics->errors().empty() && theFragmentStream->current_offset == 0 && theFragmentStream->ctxt->checkIndex > 0)
         {
+          assert(buffer_not_consumed == true);
+          
           // we still haven't moved, double the buffer size
           theFragmentStream->theBuffer.resize((theFragmentStream->theBuffer.size()-1) * 2 + 1);
           theFragmentStream->ctxt->input->base = (xmlChar*)(&theFragmentStream->theBuffer[0]);

=== added file 'test/rbkt/ExpQueryResults/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xml.res'
--- test/rbkt/ExpQueryResults/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xml.res	1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xml.res	2013-01-16 14:40:27 +0000
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<record xmlns="http://www.loc.gov/MARC21/slim">
+  <leader>01428cam a2200313 a 4500</leader>
+  <controlfield tag="001">2</controlfield>
+  <controlfield tag="005">20060629073726.0</controlfield>
+  <controlfield tag="007">cr |||||||||||</controlfield>
+  <controlfield tag="008">990629s1900    maua          000 1 eng  </controlfield>
+  <datafield tag="906" ind1=" " ind2=" ">
+    <subfield code="a">7</subfield>
+    <subfield code="b">ibc</subfield>
+    <subfield code="c">orignew</subfield>
+    <subfield code="d">u</subfield>
+    <subfield code="e">ocip</subfield>
+    <subfield code="f">19</subfield>
+    <subfield code="g">y-gencatlg</subfield>
+  </datafield>
+  <datafield tag="955" ind1=" " ind2=" ">
+    <subfield code="a">NEW INPUT vj36 06-29-99</subfield>
+    <subfield code="e">vj05 2002-04-03</subfield>
+  </datafield>
+  <datafield tag="035" ind1=" " ind2=" ">
+    <subfield code="9">(DLC)   00002266</subfield>
+  </datafield>
+  <datafield tag="010" ind1=" " ind2=" ">
+    <subfield code="a">   00002266 </subfield>
+  </datafield>
+  <datafield tag="040" ind1=" " ind2=" ">
+    <subfield code="a">DLC</subfield>
+    <subfield code="c">DLC</subfield>
+    <subfield code="d">DLC</subfield>
+  </datafield>
+  <datafield tag="050" ind1="0" ind2="0">
+    <subfield code="a">PZ3.L846</subfield>
+    <subfield code="b">So</subfield>
+  </datafield>
+  <datafield tag="051" ind1=" " ind2=" ">
+    <subfield code="a">PS3523.O46</subfield>
+    <subfield code="b">S72 1900</subfield>
+  </datafield>
+  <datafield tag="100" ind1="1" ind2=" ">
+    <subfield code="a">London, Jack,</subfield>
+    <subfield code="d">1876-1916.</subfield>
+  </datafield>
+  <datafield tag="245" ind1="1" ind2="4">
+    <subfield code="a">The son of the wolf :</subfield>
+    <subfield code="b">tales of the far North /</subfield>
+    <subfield code="c">by Jack London.</subfield>
+  </datafield>
+  <datafield tag="246" ind1="3" ind2="0">
+    <subfield code="a">Tales of the far North</subfield>
+  </datafield>
+  <datafield tag="260" ind1=" " ind2=" ">
+    <subfield code="a">Boston :</subfield>
+    <subfield code="b">Houghton, Mifflin,</subfield>
+    <subfield code="c">1900.</subfield>
+  </datafield>
+  <datafield tag="300" ind1=" " ind2=" ">
+    <subfield code="a">251 p. :</subfield>
+    <subfield code="b">1 ill. ;</subfield>
+    <subfield code="c">20 cm.</subfield>
+  </datafield>
+  <datafield tag="500" ind1=" " ind2=" ">
+    <subfield code="a">These tales appeared previously in the Overland monthly, 1899, and the Atlantic monthly.</subfield>
+  </datafield>
+  <datafield tag="505" ind1="0" ind2=" ">
+    <subfield code="a">The white silence -- The son of the wolf -- The men of Forty-Mile -- In a far country -- To the man on trail -- The priestly prerogative -- The wisdom of the trail -- The wife of a king -- An odyssey of the North.</subfield>
+  </datafield>
+  <datafield tag="530" ind1=" " ind2=" ">
+    <subfield code="a">Also available in digital form on the Library of Congress Web site.</subfield>
+  </datafield>
+  <datafield tag="856" ind1="4" ind2="1">
+    <subfield code="d">mtfrb</subfield>
+    <subfield code="f">02266</subfield>
+    <subfield code="q">h</subfield>
+    <subfield code="u">http://hdl.loc.gov/loc.rbc/mtfrb.02266</subfield>
+  </datafield>
+  <datafield tag="859" ind1="4" ind2="2">
+    <subfield code="3">Meeting of Frontiers: Siberia, Alaska, and the American West - "Rare Book Collections" Collection Description</subfield>
+    <subfield code="u">http://hdl.loc.gov/loc.eur/mtfhtml.0054</subfield>
+  </datafield>
+  <datafield tag="984" ind1=" " ind2=" ">
+    <subfield code="a">gsl</subfield>
+  </datafield>
+  <datafield tag="985" ind1=" " ind2=" ">
+    <subfield code="a">mtfront/mtfrb/tx</subfield>
+    <subfield code="e">intldl</subfield>
+  </datafield>
+  <datafield tag="985" ind1=" " ind2=" ">
+    <subfield code="a">pmpull</subfield>
+    <subfield code="e">intldl</subfield>
+  </datafield>
+</record>
+<record xmlns="http://www.loc.gov/MARC21/slim">
+  <leader>00757cam a22002171  4500</leader>
+  <controlfield tag="001">3</controlfield>
+  <controlfield tag="005">20080606094929.0</controlfield>
+  <controlfield tag="008">980227s1900    iluc          000 0 eng  </controlfield>
+  <datafield tag="035" ind1=" " ind2=" ">
+    <subfield code="9">(DLC)   00004790</subfield>
+  </datafield>
+  <datafield tag="906" ind1=" " ind2=" ">
+    <subfield code="a">0</subfield>
+    <subfield code="b">ibc</subfield>
+    <subfield code="c">orignew</subfield>
+    <subfield code="d">3</subfield>
+    <subfield code="e">ocip</subfield>
+    <subfield code="f">19</subfield>
+    <subfield code="g">y-gencatlg</subfield>
+  </datafield>
+  <datafield tag="955" ind1=" " ind2=" ">
+    <subfield code="a">jd99 02-27-98</subfield>
+  </datafield>
+  <datafield tag="010" ind1=" " ind2=" ">
+    <subfield code="a">   00004790 </subfield>
+  </datafield>
+  <datafield tag="040" ind1=" " ind2=" ">
+    <subfield code="a">DLC</subfield>
+    <subfield code="c">DLC</subfield>
+  </datafield>
+  <datafield tag="050" ind1="0" ind2="0">
+    <subfield code="a">PG3385</subfield>
+    <subfield code="b">.S85</subfield>
+  </datafield>
+  <datafield tag="100" ind1="1" ind2=" ">
+    <subfield code="a">Stockham, Alice B.</subfield>
+    <subfield code="q">(Alice Bunker),</subfield>
+    <subfield code="d">1833-1912.</subfield>
+  </datafield>
+  <datafield tag="245" ind1="1" ind2="0">
+    <subfield code="a">Tolstoi, a man of peace,</subfield>
+    <subfield code="c">by Alice B. Stockham. The new spirit by H. Havelock Ellis.</subfield>
+  </datafield>
+  <datafield tag="260" ind1=" " ind2=" ">
+    <subfield code="a">Chicago,</subfield>
+    <subfield code="b">A.B. Stockham,</subfield>
+    <subfield code="c">[1900]</subfield>
+  </datafield>
+  <datafield tag="300" ind1=" " ind2=" ">
+    <subfield code="a">140 p.</subfield>
+    <subfield code="b">ports.</subfield>
+    <subfield code="c">18 cm.</subfield>
+  </datafield>
+  <datafield tag="500" ind1=" " ind2=" ">
+    <subfield code="a">"The new spirit" (p. [85]-140) has special t.-p.</subfield>
+  </datafield>
+  <datafield tag="600" ind1="1" ind2="0">
+    <subfield code="a">Tolstoy, Leo,</subfield>
+    <subfield code="c">graf,</subfield>
+    <subfield code="d">1828-1910.</subfield>
+  </datafield>
+  <datafield tag="700" ind1="1" ind2="2">
+    <subfield code="a">Ellis, Havelock,</subfield>
+    <subfield code="d">1859-1939.</subfield>
+    <subfield code="t">New spirit.</subfield>
+    <subfield code="f">1900.</subfield>
+  </datafield>
+</record>
+

=== added file 'test/rbkt/Queries/zorba/parsing_and_serializing/bad.xml'
--- test/rbkt/Queries/zorba/parsing_and_serializing/bad.xml	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/parsing_and_serializing/bad.xml	2013-01-16 14:40:27 +0000
@@ -0,0 +1,158 @@
+<collection xmlns="http://www.loc.gov/MARC21/slim">
+<record>
+  <leader>01428cam a2200313 a 4500</leader>
+  <controlfield tag="001">2</controlfield>
+  <controlfield tag="005">20060629073726.0</controlfield>
+  <controlfield tag="007">cr |||||||||||</controlfield>
+  <controlfield tag="008">990629s1900    maua          000 1 eng  </controlfield>
+  <datafield tag="906" ind1=" " ind2=" ">
+    <subfield code="a">7</subfield>
+    <subfield code="b">ibc</subfield>
+    <subfield code="c">orignew</subfield>
+    <subfield code="d">u</subfield>
+    <subfield code="e">ocip</subfield>
+    <subfield code="f">19</subfield>
+    <subfield code="g">y-gencatlg</subfield>
+  </datafield>
+  <datafield tag="955" ind1=" " ind2=" ">
+    <subfield code="a">NEW INPUT vj36 06-29-99</subfield>
+    <subfield code="e">vj05 2002-04-03</subfield>
+  </datafield>
+  <datafield tag="035" ind1=" " ind2=" ">
+    <subfield code="9">(DLC)   00002266</subfield>
+  </datafield>
+  <datafield tag="010" ind1=" " ind2=" ">
+    <subfield code="a">   00002266 </subfield>
+  </datafield>
+  <datafield tag="040" ind1=" " ind2=" ">
+    <subfield code="a">DLC</subfield>
+    <subfield code="c">DLC</subfield>
+    <subfield code="d">DLC</subfield>
+  </datafield>
+  <datafield tag="050" ind1="0" ind2="0">
+    <subfield code="a">PZ3.L846</subfield>
+    <subfield code="b">So</subfield>
+  </datafield>
+  <datafield tag="051" ind1=" " ind2=" ">
+    <subfield code="a">PS3523.O46</subfield>
+    <subfield code="b">S72 1900</subfield>
+  </datafield>
+  <datafield tag="100" ind1="1" ind2=" ">
+    <subfield code="a">London, Jack,</subfield>
+    <subfield code="d">1876-1916.</subfield>
+  </datafield>
+  <datafield tag="245" ind1="1" ind2="4">
+    <subfield code="a">The son of the wolf :</subfield>
+    <subfield code="b">tales of the far North /</subfield>
+    <subfield code="c">by Jack London.</subfield>
+  </datafield>
+  <datafield tag="246" ind1="3" ind2="0">
+    <subfield code="a">Tales of the far North</subfield>
+  </datafield>
+  <datafield tag="260" ind1=" " ind2=" ">
+    <subfield code="a">Boston :</subfield>
+    <subfield code="b">Houghton, Mifflin,</subfield>
+    <subfield code="c">1900.</subfield>
+  </datafield>
+  <datafield tag="300" ind1=" " ind2=" ">
+    <subfield code="a">251 p. :</subfield>
+    <subfield code="b">1 ill. ;</subfield>
+    <subfield code="c">20 cm.</subfield>
+  </datafield>
+  <datafield tag="500" ind1=" " ind2=" ">
+    <subfield code="a">These tales appeared previously in the Overland monthly, 1899, and the Atlantic monthly.</subfield>
+  </datafield>
+  <datafield tag="505" ind1="0" ind2=" ">
+    <subfield code="a">The white silence -- The son of the wolf -- The men of Forty-Mile -- In a far country -- To the man on trail -- The priestly prerogative -- The wisdom of the trail -- The wife of a king -- An odyssey of the North.</subfield>
+  </datafield>
+  <datafield tag="530" ind1=" " ind2=" ">
+    <subfield code="a">Also available in digital form on the Library of Congress Web site.</subfield>
+  </datafield>
+  <datafield tag="856" ind1="4" ind2="1">
+    <subfield code="d">mtfrb</subfield>
+    <subfield code="f">02266</subfield>
+    <subfield code="q">h</subfield>
+    <subfield code="u">http://hdl.loc.gov/loc.rbc/mtfrb.02266</subfield>
+  </datafield>
+  <datafield tag="859" ind1="4" ind2="2">
+    <subfield code="3">Meeting of Frontiers: Siberia, Alaska, and the American West - &quot;Rare Book Collections&quot; Collection Description</subfield>
+    <subfield code="u">http://hdl.loc.gov/loc.eur/mtfhtml.0054</subfield>
+  </datafield>
+  <datafield tag="984" ind1=" " ind2=" ">
+    <subfield code="a">gsl</subfield>
+  </datafield>
+  <datafield tag="985" ind1=" " ind2=" ">
+    <subfield code="a">mtfront/mtfrb/tx</subfield>
+    <subfield code="e">intldl</subfield>
+  </datafield>
+  <datafield tag="985" ind1=" " ind2=" ">
+    <subfield code="a">pmpull</subfield>
+    <subfield code="e">intldl</subfield>
+  </datafield>
+</record>
+<record>
+  <leader>00757cam a22002171  4500</leader>
+  <controlfield tag="001">3</controlfield>
+  <controlfield tag="005">20080606094929.0</controlfield>
+  <controlfield tag="008">980227s1900    iluc          000 0 eng  </controlfield>
+  <datafield tag="035" ind1=" " ind2=" ">
+    <subfield code="9">(DLC)   00004790</subfield>
+  </datafield>
+  <datafield tag="906" ind1=" " ind2=" ">
+    <subfield code="a">0</subfield>
+    <subfield code="b">ibc</subfield>
+    <subfield code="c">orignew</subfield>
+    <subfield code="d">3</subfield>
+    <subfield code="e">ocip</subfield>
+    <subfield code="f">19</subfield>
+    <subfield code="g">y-gencatlg</subfield>
+  </datafield>
+  <datafield tag="955" ind1=" " ind2=" ">
+    <subfield code="a">jd99 02-27-98</subfield>
+  </datafield>
+  <datafield tag="010" ind1=" " ind2=" ">
+    <subfield code="a">   00004790 </subfield>
+  </datafield>
+  <datafield tag="040" ind1=" " ind2=" ">
+    <subfield code="a">DLC</subfield>
+    <subfield code="c">DLC</subfield>
+  </datafield>
+  <datafield tag="050" ind1="0" ind2="0">
+    <subfield code="a">PG3385</subfield>
+    <subfield code="b">.S85</subfield>
+  </datafield>
+  <datafield tag="100" ind1="1" ind2=" ">
+    <subfield code="a">Stockham, Alice B.</subfield>
+    <subfield code="q">(Alice Bunker),</subfield>
+    <subfield code="d">1833-1912.</subfield>
+  </datafield>
+  <datafield tag="245" ind1="1" ind2="0">
+    <subfield code="a">Tolstoi, a man of peace,</subfield>
+    <subfield code="c">by Alice B. Stockham. The new spirit by H. Havelock Ellis.</subfield>
+  </datafield>
+  <datafield tag="260" ind1=" " ind2=" ">
+    <subfield code="a">Chicago,</subfield>
+    <subfield code="b">A.B. Stockham,</subfield>
+    <subfield code="c">[1900]</subfield>
+  </datafield>
+  <datafield tag="300" ind1=" " ind2=" ">
+    <subfield code="a">140 p.</subfield>
+    <subfield code="b">ports.</subfield>
+    <subfield code="c">18 cm.</subfield>
+  </datafield>
+  <datafield tag="500" ind1=" " ind2=" ">
+    <subfield code="a">&quot;The new spirit&quot; (p. [85]-140) has special t.-p.</subfield>
+  </datafield>
+  <datafield tag="600" ind1="1" ind2="0">
+    <subfield code="a">Tolstoy, Leo,</subfield>
+    <subfield code="c">graf,</subfield>
+    <subfield code="d">1828-1910.</subfield>
+  </datafield>
+  <datafield tag="700" ind1="1" ind2="2">
+    <subfield code="a">Ellis, Havelock,</subfield>
+    <subfield code="d">1859-1939.</subfield>
+    <subfield code="t">New spirit.</subfield>
+    <subfield code="f">1900.</subfield>
+  </datafield>
+</record>
+</collection>

=== added file 'test/rbkt/Queries/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xq'
--- test/rbkt/Queries/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xq	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/parsing_and_serializing/parse-fragment-skip-root-57.xq	2013-01-16 14:40:27 +0000
@@ -0,0 +1,13 @@
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import schema namespace opt = "http://www.zorba-xquery.com/modules/xml-options";
+import module namespace fetch = "http://www.zorba-xquery.com/modules/fetch";
+
+variable $xmlcontents := fetch:content(resolve-uri("bad.xml"));
+
+let $contents := parse-xml:parse(
+      $xmlcontents,
+      <opt:options>
+        <opt:parse-external-parsed-entity opt:skip-root-nodes="1"/>
+      </opt:options>)
+
+return $contents

-- 
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp

Reply via email to