The str() method of the std::stringstream class returns a std::string by value, so the temporary is destroyed at the end of the statement that creates XMLBuf; the MemBufInputSource is then looking at deleted memory, and the parser will read garbage.
Try changing the code to do

    //create an input source for the XML parser
    // Copy the stream contents into a named string first, so the bytes handed
    // to MemBufInputSource stay alive for as long as 'buff' is in scope
    // instead of belonging to a temporary returned by str().
    std::string buff = sbuffer.str();
    xercesc::MemBufInputSource XMLBuf((const XMLByte*)(buff.c_str()), buff.size(), 
"InputXML");

Alberto

Il 26/03/14 16:56, tenspd137 . ha scritto:
Hi all -

I have a situation where I have to process bad XML - ie like the following:
  (saved as test_metadata.xml)

<?xml version="1.0" encoding="utf-8"?>
<MyTest>
<Version>4354</Version>
</MyTest>
<RemoveMe>
<?xml version="1.0" encoding="utf-8"?>
<Metadata>
<Version>4356</Version>
<a>4928</a>
<b>6400</b>
</Metadata>
</RemoveMe>

It is like two XML files in one.  What I am doing is reading the file into a
stringstream buffer, taking only the contents between the RemoveMe tags,
and treating it as my memory buffer to be parsed.  I keep getting invalid
multibyte sequence errors.  I am running a UTF8 enabled system, and am
pretty sure that I can save UTF8 files.  I have tried the second file I
made with emacs and printed it with DOMPrint:

saved as test2_metadata.xml

<?xml version="1.0" encoding="utf-8"?>
<MyTest>
<Version>4354</Version>
<Id>1</Id>
</MyTest>

but my own code still gives me an invalid multibyte sequence in the first
line.

My code is:

#include <fstream>
#include <string>
#include <sstream>
#include <iostream>

//stuff to parse XML
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/dom/DOM.hpp>
#include <xercesc/sax/HandlerBase.hpp>
#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XercesDefs.hpp>

// Error handler that reports a fatal parse error on stdout and then
// terminates the program.
class XmlDomErrorHandler : public xercesc::HandlerBase
{
public:
    // Prints the line/column of the fatal error followed by the parser's
    // message (transcoded to a char string), then exits with status -1.
    void fatalError(const xercesc::SAXParseException &exc)
    {
        const int lineNo = static_cast<int>(exc.getLineNumber());
        const int columnNo = static_cast<int>(exc.getColumnNumber());
        printf("Fatal parsing error at line %d, col %d\n", lineNo, columnNo);

        // transcode() hands back a buffer that has to be given back via release().
        char* text = xercesc::XMLString::transcode(exc.getMessage());
        printf("%s\n", text);
        xercesc::XMLString::release(&text);

        exit(-1);
    }
};

int main(int argc, char* argv[])
{
     std::ifstream metadata(argv[1]);
     std::string line;
     std::string startLine("<RemoveMe>\n");
     std::string stopLine("</RemoveMe>\n");
     std::stringstream sbuffer;
     xercesc::XercesDOMParser* parser;
     xercesc::ErrorHandler* errorHandler;

     try { xercesc::XMLPlatformUtils::Initialize(); }
     catch (const xercesc::XMLException& toCatch)
     {
     char* message = xercesc::XMLString::transcode(toCatch.getMessage());
     std::cout << "Error during Xerces initalization! :" << std::endl <<
message << std::endl;
     xercesc::XMLString::release(&message);
     return 1;
     }

     {
     std::cout << "Init good..." << std::endl;

     parser = new xercesc::XercesDOMParser();
     errorHandler = (xercesc::ErrorHandler*) new XmlDomErrorHandler();
     parser->setErrorHandler(errorHandler);
     //int count = 0;
     //while( line != stopLine )
     //{
     //    std::getline(metadata, line);
     //}


     //read the reast of the file into a buffer

     sbuffer << metadata.rdbuf();

     metadata.close();

     std::cout << "BEGIN BUFFER DUMP" << std::endl;

     std::cout << sbuffer.str() << std::endl;

     std::size_t start = sbuffer.str().find(startLine) + startLine.length();
     std::size_t stop  = sbuffer.str().find(stopLine);
     std::size_t length = stop-start;


     if (start != std::string::npos && stop != std::string::npos)
         sbuffer.str(sbuffer.str().substr(start,length));

     std::cout << "Start:" << start << std::endl << "Stop:" << stop <<
std::endl << "Length:" << length << std::endl;
     std::cout << "Second Buffer Dump" << std::endl;
     std::cout << sbuffer.str() << "END" << std::endl;

     //create an input source for the XML parser
     xercesc::MemBufInputSource XMLBuf((const
XMLByte*)(sbuffer.str().c_str()), sbuffer.str().size(), "InputXML");


     //XMLBuf.setEncoding(xercesc::XMLString::transcode("LATIN1"));

     parser->parse(XMLBuf);

     std::cout << "Shouldn't make it here..." << std::endl;
     xercesc::DOMElement* docRootNode;
     xercesc::DOMDocument* doc;
     xercesc::DOMNodeIterator* walker;

     doc = parser->getDocument();
     docRootNode = doc->getDocumentElement();

     walker =
doc->createNodeIterator(docRootNode,xercesc::DOMNodeFilter::SHOW_ELEMENT,
NULL,true);

     //Walk the XML
     xercesc::DOMNode * CurrentNode = NULL;
     std::string thisNodeName;
     std::string parentNodeName;

     for (CurrentNode = walker->nextNode(); CurrentNode !=0; CurrentNode =
walker->nextNode())
     {
         thisNodeName =
xercesc::XMLString::transcode(CurrentNode->getNodeName());
         parentNodeName =
xercesc::XMLString::transcode(CurrentNode->getParentNode()->getNodeName());

         std::cout << thisNodeName << std::endl;
         std::cout << parentNodeName << std::endl;
     }
     }

     xercesc::XMLPlatformUtils::Terminate();

     return 0;

}

compiled with:

g++ test.cpp -o test `pkg-config xerces-c --cflags --libs`

Just run it with ./test <file>

Can anyone help me figure out what I am doing wrong?  I know that
processing text in C++ can be tricky, and I am probably missing something
stupid, but it seems like this should be do-able.

Thanks for any help in advance.

-C


Reply via email to