Gerben,

Maybe this is of some use. The Lua code in the file checks the xml and returns info when something is wrong.

I am using it for example in my takenotes module as:
<notes file="events/na-30416155-5-48v.xml" checkxml="yes"/>

And this is a sketch of how it is embedded in that module:

% Prerun an xml check.
\doifinset{\xmlatt{#1}{checkxml}}{yes,on,true}%
{\xmlcommand{#1}{.}{xmlcommon:checkxml}}

% Execute this code if no error was found.
etc.

dr. Hans van der Meer

\startluacode
        -- Define our namespace as hvdm
        hvdm = hvdm or {}

        -- Checks that the node structure of an XML string is balanced.
        --
        -- The check is textual, not a real parse: comments, the <?xml ?>
        -- declaration, DOCTYPE, text content, self-closing nodes and
        -- attributes are stripped, after which matching <node></node> pairs
        -- are cancelled against each other. Whatever survives is reported.
        --
        -- Parameters:
        --   xml      string with the XML text; nil is treated as "".
        --   linesep  separator placed between reported problems;
        --            defaults to "\n".
        -- Returns:
        --   "" when no problem was found (an empty input counts as correct),
        --   otherwise "XML tree is not correct" followed by one line per
        --   problem, each terminated by linesep.
        hvdm.checkXMLtree = function (xml, linesep)

            xml = xml or ""
            linesep = linesep or "\n"
            local badspace = {}

            -- Empty files do not need processing.
            if xml == "" then
                return ""
            end

            -- Step one is remove leading characters before first node.
            local leadingnode = string.find(xml, "<", 1, true) or 1
            if leadingnode ~= 1 then
                xml = string.sub(xml, leadingnode)
            end

            -- Strip all comments <!-- -->; they may contain node pairs that
            -- must not take part in the balance check.
            local c1, c2 = string.find(xml, "<!--", 1, true)
            while c1 do
                -- Search for closing of comment.
                local _, c4 = string.find(xml, "-->", c2 + 1, true)
                if c4 then
                    -- Remove comment and search for the next one.
                    xml = string.sub(xml, 1, c1 - 1) .. string.sub(xml, c4 + 1)
                    c1, c2 = string.find(xml, "<!--", 1, true)
                else
                    badspace[#badspace + 1] =
                        " >>> Open comment <!-- Fix this first"
                    c1 = nil
                end
            end

            -- Strip <?xml ... ?> and DOCTYPE nodes. The matches are lazy and
            -- anchored on "<!DOCTYPE" so that they cannot swallow unrelated
            -- parts of the document (a greedy match would run up to the last
            -- "?>" or the last occurrence of the word DOCTYPE).
            if string.find(xml, "<%?xml.-%?>") then
                xml = string.gsub(xml, "<%?xml.-%?>", "")
            else
                badspace[#badspace + 1] =
                    " >>> missing or incorrect <?xml ?>"
            end
            xml = string.gsub(xml, "<!DOCTYPE[^>]*>", "")

            -- Strip everything between > and <.
            xml = string.gsub(xml, ">[^<]*<", "><")

            -- Spot all cases of '< node', etc. which have bad spaces.
            for node in string.gmatch(xml, "</?%s+[^>]*") do
                badspace[#badspace + 1] =
                    " >>> erroneous whitespace in " .. node .. ">"
            end
            for node in string.gmatch(xml, "<[^>]*%s+/?>") do
                badspace[#badspace + 1] =
                    " >>> erroneous whitespace in " .. node
            end

            -- Remove the bad spaces so the remaining checks can continue.
            if #badspace > 0 then
                xml = string.gsub(xml, "<%s+", "<")
                xml = string.gsub(xml, "</%s+", "</")
                xml = string.gsub(xml, "%s+>", ">")
                xml = string.gsub(xml, "%s+/>", "/>")
            end

            -- Strip all <xx/> nodes, must be done before stripping attributes.
            xml = string.gsub(xml, "<[^>]*/>", "")

            -- Strip attributes: cut from the whitespace after a node name up
            -- to (not including) the next ">". The loop ends as soon as no
            -- "<name " is left or no closing ">" follows it; every pass
            -- removes at least one character, so it always terminates.
            while true do
                local _, nameend = string.find(xml, "<%S+%s+")
                if not nameend then
                    break
                end
                local close = string.find(xml, ">", nameend, true)
                if not close then
                    break
                end
                xml = string.sub(xml, 1, nameend - 1) .. string.sub(xml, close)
            end

            -- Strip all remaining whitespace.
            xml = string.gsub(xml, "%s", "")

            -- At this point we have the bare <node> tree.
            -- Cancel every <node> that is directly followed by its </node>.
            -- A stack does this in one pass: a closing tag that matches the
            -- tag on top of the stack removes it, anything else is pushed.
            -- Tags are compared as plain strings, so names containing
            -- pattern characters such as "-" or "." are handled correctly.
            -- The xml is correct when nothing is left.
            local pending = {}
            local pos = 1
            while true do
                local first, last = string.find(xml, "<[^>]*>", pos)
                if not first then
                    break
                end
                if first > pos then
                    -- Stray characters between two tags are kept as leftover.
                    pending[#pending + 1] = string.sub(xml, pos, first - 1)
                end
                local tag = string.sub(xml, first, last)
                local top = pending[#pending]
                if top
                    and string.sub(tag, 1, 2) == "</"
                    and string.sub(top, 1, 2) ~= "</"
                    and string.sub(top, 2) == string.sub(tag, 3) then
                    -- Matching pair: drop the opening tag, skip the closer.
                    pending[#pending] = nil
                else
                    pending[#pending + 1] = tag
                end
                pos = last + 1
            end
            if pos <= #xml then
                -- Characters after the last tag are leftover as well.
                pending[#pending + 1] = string.sub(xml, pos)
            end
            xml = table.concat(pending)

            -- If something is leftover then the xml tree is not correct.
            if #xml > 0 then
                badspace[#badspace + 1] = " >>> " .. xml
            end

            if #badspace > 0 then
                return "XML tree is not correct" .. linesep ..
                    table.concat(badspace, linesep) .. linesep
            end
            return ""

        end     -- end of checkXMLtree

-- Function checks the correctness of an XML tree in given file.
--
-- Parameters:
--   filename  path of the file to check; must carry the suffix "xml"
--             according to hvdm.filesuffix (defined outside this block;
--             presumably it returns the extension without the dot).
-- Returns:
--   "" when the file content passes hvdm.checkXMLtree (an empty file has
--   no errors), otherwise a message that starts with the file name.
--   Problems are separated by "\crlf " so the text can be typeset by TeX.
--   A file that cannot be opened is reported as well instead of silently
--   returning nil.
        hvdm.checkXMLfile = function (filename)

            -- Filename must have suffix .xml.
            if hvdm.filesuffix(filename) ~= "xml" then
                return "Filename of " .. filename .. " missing suffix .xml"
            end

            -- Open the file for reading.
            local inputfile = io.open(filename, "r")
            if not inputfile then
                return "XML check of file " .. filename ..
                    " : cannot open file"
            end

            -- Read contents into a string then close the file.
            local xml = inputfile:read("a")
            inputfile:close()

            -- Execute the test.
            local outcome = hvdm.checkXMLtree(xml, "\\crlf ")
            if outcome == "" then
                return ""
            end
            return "XML check of file " .. filename .. " : " .. outcome

        end     -- end of checkXMLfile

-- Function checks the correctness of the XML trees of all .xml files in
-- the given directory.
--
-- Parameters:
--   directoryname  directory whose *.xml entries are checked (listed with
--                  the shell command "ls -1", so a Unix-like shell is
--                  required).
-- Returns:
--   The concatenated results of hvdm.checkXMLfile for every listed file;
--   "" when no file reported a problem. When the listing command cannot
--   be started a message saying so is returned.
        hvdm.checkXMLfiles = function (directoryname)

            -- Quote the directory name for the shell so that spaces or shell
            -- metacharacters in it cannot alter the command. The glob itself
            -- stays outside the quotes so the shell still expands it.
            local escaped = string.gsub(directoryname, "'", "'\\''")
            local command = "ls -1 '" .. escaped .. "'/*.xml"

            -- Read the listing straight from the command; this avoids the
            -- shared temporary file that concurrent runs would overwrite.
            local listing = io.popen(command, "r")
            if not listing then
                return "XML check of directory " .. directoryname ..
                    " : cannot list files"
            end

            -- Retrieve line by line the filenames and put into a table.
            local filenames = {}
            for filename in listing:lines() do
                filenames[#filenames + 1] = filename
            end
            listing:close()

            -- Check one by one the xml files from the table. A nil result
            -- is treated as "no message" so one file cannot abort the run.
            local outcomes = {}
            for _, name in ipairs(filenames) do
                outcomes[#outcomes + 1] = hvdm.checkXMLfile(name) or ""
            end

            -- Return combined outcome.
            return table.concat(outcomes)

        end     -- end of checkXMLfiles

\stopluacode

On 13 May 2020, at 23:00, Gerben Wierda <gerben.wie...@rna.nl> wrote:

I am loading an XML file. I sometimes make mistakes editing that file and then xml.load creates a string that contains

<error>invalid xml file - parsed text</error>

At that point I check my file and sometimes (like now) I cannot find the error.

Is there a way to make xml.load somewhat more verbose about where it encounters the error? 

G
___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the Wiki!

maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage  : http://www.pragma-ade.nl / http://context.aanhet.net
archive  : https://bitbucket.org/phg/context-mirror/commits/
wiki     : http://contextgarden.net
___________________________________________________________________________________

___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the Wiki!

maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage  : http://www.pragma-ade.nl / http://context.aanhet.net
archive  : https://bitbucket.org/phg/context-mirror/commits/
wiki     : http://contextgarden.net
___________________________________________________________________________________

Reply via email to