Hi all - I have a situation where I have to process bad XML, i.e. something like the following (saved as test_metadata.xml):
<?xml version="1.0" encoding="utf-8"?> <MyTest> <Version>4354</Version> </MyTest> <RemoveMe> <?xml version="1.0" encoding="utf-8"?> <Metadata> <Version>4356</Version> <a>4928</a> <b>6400</b> </Metadata> </RemoveMe> It is like two XML files in one. What I am doing is reading the file into a stringstream buffer, taking only the contents between the <RemoveMe> tags, and treating that as my memory buffer to be parsed. I keep getting invalid multibyte sequence errors. I am running a UTF-8-enabled system, and am pretty sure that I can save UTF-8 files. I have also tried a second file, which I made with emacs and printed with DOMPrint (saved as test2_metadata.xml): <?xml version="1.0" encoding="utf-8"?> <MyTest> <Version>4354</Version> <Id>1</Id> </MyTest> but my own code still gives me an invalid multibyte sequence in the first line. My code is: #include <fstream> #include <string> #include <sstream> #include <iostream> //stuff to parse XML #include <xercesc/parsers/XercesDOMParser.hpp> #include <xercesc/dom/DOM.hpp> #include <xercesc/sax/HandlerBase.hpp> #include <xercesc/framework/MemBufInputSource.hpp> #include <xercesc/util/XMLString.hpp> #include <xercesc/util/PlatformUtils.hpp> #include <xercesc/util/XercesDefs.hpp> class XmlDomErrorHandler : public xercesc::HandlerBase { public: void fatalError(const xercesc::SAXParseException &exc) { printf("Fatal parsing error at line %d, col %d\n", (int)exc.getLineNumber(), (int)exc.getColumnNumber()); char* msg = xercesc::XMLString::transcode( exc.getMessage() ); printf("%s\n", msg); xercesc::XMLString::release( &msg ); exit(-1); } }; int main(int argc, char* argv[]) { std::ifstream metadata(argv[1]); std::string line; std::string startLine("<RemoveMe>\n"); std::string stopLine("</RemoveMe>\n"); std::stringstream sbuffer; xercesc::XercesDOMParser* parser; xercesc::ErrorHandler* errorHandler; try { xercesc::XMLPlatformUtils::Initialize(); } catch (const xercesc::XMLException& toCatch) { char* message = 
xercesc::XMLString::transcode(toCatch.getMessage()); std::cout << "Error during Xerces initalization! :" << std::endl << message << std::endl; xercesc::XMLString::release(&message); return 1; } { std::cout << "Init good..." << std::endl; parser = new xercesc::XercesDOMParser(); errorHandler = (xercesc::ErrorHandler*) new XmlDomErrorHandler(); parser->setErrorHandler(errorHandler); //int count = 0; //while( line != stopLine ) //{ // std::getline(metadata, line); //} //read the reast of the file into a buffer sbuffer << metadata.rdbuf(); metadata.close(); std::cout << "BEGIN BUFFER DUMP" << std::endl; std::cout << sbuffer.str() << std::endl; std::size_t start = sbuffer.str().find(startLine) + startLine.length(); std::size_t stop = sbuffer.str().find(stopLine); std::size_t length = stop-start; if (start != std::string::npos && stop != std::string::npos) sbuffer.str(sbuffer.str().substr(start,length)); std::cout << "Start:" << start << std::endl << "Stop:" << stop << std::endl << "Length:" << length << std::endl; std::cout << "Second Buffer Dump" << std::endl; std::cout << sbuffer.str() << "END" << std::endl; //create an input source for the XML parser xercesc::MemBufInputSource XMLBuf((const XMLByte*)(sbuffer.str().c_str()), sbuffer.str().size(), "InputXML"); //XMLBuf.setEncoding(xercesc::XMLString::transcode("LATIN1")); parser->parse(XMLBuf); std::cout << "Shouldn't make it here..." 
<< std::endl; xercesc::DOMElement* docRootNode; xercesc::DOMDocument* doc; xercesc::DOMNodeIterator* walker; doc = parser->getDocument(); docRootNode = doc->getDocumentElement(); walker = doc->createNodeIterator(docRootNode,xercesc::DOMNodeFilter::SHOW_ELEMENT, NULL,true); //Walk the XML xercesc::DOMNode * CurrentNode = NULL; std::string thisNodeName; std::string parentNodeName; for (CurrentNode = walker->nextNode(); CurrentNode !=0; CurrentNode = walker->nextNode()) { thisNodeName = xercesc::XMLString::transcode(CurrentNode->getNodeName()); parentNodeName = xercesc::XMLString::transcode(CurrentNode->getParentNode()->getNodeName()); std::cout << thisNodeName << std::endl; std::cout << parentNodeName << std::endl; } } xercesc::XMLPlatformUtils::Terminate(); return 0; } Compiled with: g++ test.cpp -o test `pkg-config xerces-c --cflags --libs` Just run it with ./test <file>. Can anyone help me figure out what I am doing wrong? I know that processing text in C++ can be tricky, and I am probably missing something stupid, but it seems like this should be doable. Thanks in advance for any help. -C