Hi all -

I have a situation where I have to process bad XML - ie like the following:
 (saved as test_metadata.xml)

<?xml version="1.0" encoding="utf-8"?>
<MyTest>
<Version>4354</Version>
</MyTest>
<RemoveMe>
<?xml version="1.0" encoding="utf-8"?>
<Metadata>
<Version>4356</Version>
<a>4928</a>
<b>6400</b>
</Metadata>
</RemoveMe>

It is like two XML files in one.  What I am doing is reading the file into a
stringstream buffer, taking only the contents between the RemoveMe tags,
and treating that as my memory buffer to be parsed.  I keep getting invalid
multibyte sequence errors.  I am running a UTF-8 enabled system, and am
pretty sure that I can save UTF-8 files.  I have tried the second file, which I
made with emacs, and printed it with DOMPrint:

saved as test2_metadata.xml

<?xml version="1.0" encoding="utf-8"?>
<MyTest>
<Version>4354</Version>
<Id>1</Id>
</MyTest>

but my own code still gives me an invalid multibyte sequence in the first
line.

My code is:

#include <fstream>
#include <string>
#include <sstream>
#include <iostream>

//stuff to parse XML
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/dom/DOM.hpp>
#include <xercesc/sax/HandlerBase.hpp>
#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XercesDefs.hpp>

// Error handler installed on the DOM parser. Only fatalError() is overridden
// here: it reports where parsing failed and what went wrong on stdout, then
// terminates the process with exit status -1.
class XmlDomErrorHandler : public xercesc::HandlerBase
{
public:
    // Prints the line/column and the message carried by the parse exception,
    // then exits; this function never returns to the parser.
    void fatalError(const xercesc::SAXParseException& exc)
    {
        const int lineNumber = static_cast<int>(exc.getLineNumber());
        const int columnNumber = static_cast<int>(exc.getColumnNumber());
        printf("Fatal parsing error at line %d, col %d\n", lineNumber, columnNumber);

        // transcode() hands back a buffer that has to be given back through
        // XMLString::release() once it has been printed.
        char* description = xercesc::XMLString::transcode(exc.getMessage());
        printf("%s\n", description);
        xercesc::XMLString::release(&description);

        exit(-1);
    }
};

int main(int argc, char* argv[])
{
    std::ifstream metadata(argv[1]);
    std::string line;
    std::string startLine("<RemoveMe>\n");
    std::string stopLine("</RemoveMe>\n");
    std::stringstream sbuffer;
    xercesc::XercesDOMParser* parser;
    xercesc::ErrorHandler* errorHandler;

    try { xercesc::XMLPlatformUtils::Initialize(); }
    catch (const xercesc::XMLException& toCatch)
    {
    char* message = xercesc::XMLString::transcode(toCatch.getMessage());
    std::cout << "Error during Xerces initalization! :" << std::endl <<
message << std::endl;
    xercesc::XMLString::release(&message);
    return 1;
    }

    {
    std::cout << "Init good..." << std::endl;

    parser = new xercesc::XercesDOMParser();
    errorHandler = (xercesc::ErrorHandler*) new XmlDomErrorHandler();
    parser->setErrorHandler(errorHandler);
    //int count = 0;
    //while( line != stopLine )
    //{
    //    std::getline(metadata, line);
    //}


    //read the reast of the file into a buffer

    sbuffer << metadata.rdbuf();

    metadata.close();

    std::cout << "BEGIN BUFFER DUMP" << std::endl;

    std::cout << sbuffer.str() << std::endl;

    std::size_t start = sbuffer.str().find(startLine) + startLine.length();
    std::size_t stop  = sbuffer.str().find(stopLine);
    std::size_t length = stop-start;


    if (start != std::string::npos && stop != std::string::npos)
        sbuffer.str(sbuffer.str().substr(start,length));

    std::cout << "Start:" << start << std::endl << "Stop:" << stop <<
std::endl << "Length:" << length << std::endl;
    std::cout << "Second Buffer Dump" << std::endl;
    std::cout << sbuffer.str() << "END" << std::endl;

    //create an input source for the XML parser
    xercesc::MemBufInputSource XMLBuf((const
XMLByte*)(sbuffer.str().c_str()), sbuffer.str().size(), "InputXML");


    //XMLBuf.setEncoding(xercesc::XMLString::transcode("LATIN1"));

    parser->parse(XMLBuf);

    std::cout << "Shouldn't make it here..." << std::endl;
    xercesc::DOMElement* docRootNode;
    xercesc::DOMDocument* doc;
    xercesc::DOMNodeIterator* walker;

    doc = parser->getDocument();
    docRootNode = doc->getDocumentElement();

    walker =
doc->createNodeIterator(docRootNode,xercesc::DOMNodeFilter::SHOW_ELEMENT,
NULL,true);

    //Walk the XML
    xercesc::DOMNode * CurrentNode = NULL;
    std::string thisNodeName;
    std::string parentNodeName;

    for (CurrentNode = walker->nextNode(); CurrentNode !=0; CurrentNode =
walker->nextNode())
    {
        thisNodeName =
xercesc::XMLString::transcode(CurrentNode->getNodeName());
        parentNodeName =
xercesc::XMLString::transcode(CurrentNode->getParentNode()->getNodeName());

        std::cout << thisNodeName << std::endl;
        std::cout << parentNodeName << std::endl;
    }
    }

    xercesc::XMLPlatformUtils::Terminate();

    return 0;

}

compiled with:

g++ test.cpp -o test `pkg-config xerces-c --cflags --libs`

Just run it with ./test <file>

Can anyone help me figure out what I am doing wrong?  I know that
processing text in C++ can be tricky, and I am probably missing something
stupid, but it seems like this should be do-able.

Thanks for any help in advance.

-C

Reply via email to