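The str() method of the std::stringstream class returns a std::string by
value. That temporary string is destroyed at the end of the statement that
constructs the MemBufInputSource, so the input source is left pointing at
freed memory, and the parser will read garbage.
Try changing the code to keep the string alive in a named variable: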
//create an input source for the XML parser
std::string buff = sbuffer.str();
xercesc::MemBufInputSource XMLBuf((const XMLByte*)(buff.c_str()), buff.size(),
"InputXML");
Alberto
Il 26/03/14 16:56, tenspd137 . ha scritto:
Hi all -
I have a situation where I have to process bad XML, i.e., like the following:
(saved as test_metadata.xml)
<?xml version="1.0" encoding="utf-8"?>
<MyTest>
<Version>4354</Version>
</MyTest>
<RemoveMe>
<?xml version="1.0" encoding="utf-8"?>
<Metadata>
<Version>4356</Version>
<a>4928</a>
<b>6400</b>
</Metadata>
</RemoveMe>
It is like two XML files in one. What I am doing is reading the file into a
stringstream buffer, taking only the contents between the RemoveMe tags,
and treating that as my memory buffer to be parsed. I keep getting invalid
multibyte sequence errors. I am running a UTF-8 enabled system, and am
pretty sure that I can save UTF-8 files. I have also tried a second file,
which I made with emacs and printed with DOMPrint:
saved as test2_metadata.xml
<?xml version="1.0" encoding="utf-8"?>
<MyTest>
<Version>4354</Version>
<Id>1</Id>
</MyTest>
but my own code still gives me an invalid multibyte sequence in the first
line.
My code is:
#include <fstream>
#include <string>
#include <sstream>
#include <iostream>
//stuff to parse XML
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/dom/DOM.hpp>
#include <xercesc/sax/HandlerBase.hpp>
#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XercesDefs.hpp>
// Error handler for the DOM parser: on a fatal parse error it reports the
// position and the parser's message on stdout, then terminates the process.
class XmlDomErrorHandler : public xercesc::HandlerBase
{
public:
    // Prints "Fatal parsing error at line L, col C", followed by the
    // transcoded exception message on its own line, and exits with status -1.
    void fatalError(const xercesc::SAXParseException &exc)
    {
        const int lineNo = static_cast<int>(exc.getLineNumber());
        const int columnNo = static_cast<int>(exc.getColumnNumber());
        printf("Fatal parsing error at line %d, col %d\n", lineNo, columnNo);

        // transcode() hands back a buffer that has to be given back to
        // Xerces with XMLString::release().
        char* text = xercesc::XMLString::transcode(exc.getMessage());
        printf("%s\n", text);
        xercesc::XMLString::release(&text);

        exit(-1);
    }
};
int main(int argc, char* argv[])
{
std::ifstream metadata(argv[1]);
std::string line;
std::string startLine("<RemoveMe>\n");
std::string stopLine("</RemoveMe>\n");
std::stringstream sbuffer;
xercesc::XercesDOMParser* parser;
xercesc::ErrorHandler* errorHandler;
try { xercesc::XMLPlatformUtils::Initialize(); }
catch (const xercesc::XMLException& toCatch)
{
char* message = xercesc::XMLString::transcode(toCatch.getMessage());
std::cout << "Error during Xerces initalization! :" << std::endl <<
message << std::endl;
xercesc::XMLString::release(&message);
return 1;
}
{
std::cout << "Init good..." << std::endl;
parser = new xercesc::XercesDOMParser();
errorHandler = (xercesc::ErrorHandler*) new XmlDomErrorHandler();
parser->setErrorHandler(errorHandler);
//int count = 0;
//while( line != stopLine )
//{
// std::getline(metadata, line);
//}
//read the reast of the file into a buffer
sbuffer << metadata.rdbuf();
metadata.close();
std::cout << "BEGIN BUFFER DUMP" << std::endl;
std::cout << sbuffer.str() << std::endl;
std::size_t start = sbuffer.str().find(startLine) + startLine.length();
std::size_t stop = sbuffer.str().find(stopLine);
std::size_t length = stop-start;
if (start != std::string::npos && stop != std::string::npos)
sbuffer.str(sbuffer.str().substr(start,length));
std::cout << "Start:" << start << std::endl << "Stop:" << stop <<
std::endl << "Length:" << length << std::endl;
std::cout << "Second Buffer Dump" << std::endl;
std::cout << sbuffer.str() << "END" << std::endl;
//create an input source for the XML parser
xercesc::MemBufInputSource XMLBuf((const
XMLByte*)(sbuffer.str().c_str()), sbuffer.str().size(), "InputXML");
//XMLBuf.setEncoding(xercesc::XMLString::transcode("LATIN1"));
parser->parse(XMLBuf);
std::cout << "Shouldn't make it here..." << std::endl;
xercesc::DOMElement* docRootNode;
xercesc::DOMDocument* doc;
xercesc::DOMNodeIterator* walker;
doc = parser->getDocument();
docRootNode = doc->getDocumentElement();
walker =
doc->createNodeIterator(docRootNode,xercesc::DOMNodeFilter::SHOW_ELEMENT,
NULL,true);
//Walk the XML
xercesc::DOMNode * CurrentNode = NULL;
std::string thisNodeName;
std::string parentNodeName;
for (CurrentNode = walker->nextNode(); CurrentNode !=0; CurrentNode =
walker->nextNode())
{
thisNodeName =
xercesc::XMLString::transcode(CurrentNode->getNodeName());
parentNodeName =
xercesc::XMLString::transcode(CurrentNode->getParentNode()->getNodeName());
std::cout << thisNodeName << std::endl;
std::cout << parentNodeName << std::endl;
}
}
xercesc::XMLPlatformUtils::Terminate();
return 0;
}
compiled with:
g++ test.cpp -o test `pkg-config xerces-c --cflags --libs`
Just run it with ./test <file>
Can anyone help me figure out what I am doing wrong? I know that
processing text in C++ can be tricky, and I am probably missing something
stupid, but it seems like this should be do-able.
Thanks for any help in advance.
-C