officeconverter

Asiri Rathnayake Tue, 28 Oct 2008 09:52:40 -0700

Hi Vincent,

On Tue, Oct 28, 2008 at 7:53 PM, Vincent Massol <[EMAIL PROTECTED]> wrote:


> Hi Asiri,
>
> I think I'd really prefer one filter per class. Same as what is done
> in the HTML cleaner. Also please don't use any *Utils class and no
> static please (these are both anti-patterns).
>

Ok, reverting now.

Thanks.

- Asiri


>
> Thanks
> -Vincent
>
> On Oct 28, 2008, at 2:54 PM, asiri (SVN) wrote:
>
> > Author: asiri
> > Date: 2008-10-28 14:54:04 +0100 (Tue, 28 Oct 2008)
> > New Revision: 13868
> >
> > Removed:
> >   sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/filter/
> > Modified:
> >   sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/OfficeImporterPlugin.java
> >   sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/utils/HtmlFilterUtils.java
> >   sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/
> > plugin/officeconverter/CleanHTMLTest.java
> > Log:
> > Moved all html filter code into a single utility class called
> > HtmlFilterUtils. I thought of introducing some sort of a filter
> > chain (maybe the chain of responsibility pattern) but it seemed like
> > overkill for this scenario.
> >
> > Modified: sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/
> > xwiki/plugin/officeimporter/OfficeImporterPlugin.java
> > ===================================================================
> > --- sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/OfficeImporterPlugin.java       2008-10-28 11:33:41
> > UTC (rev 13867)
> > +++ sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/OfficeImporterPlugin.java       2008-10-28 13:54:04
> > UTC (rev 13868)
> > @@ -57,14 +57,9 @@
> > import com.xpn.xwiki.doc.XWikiDocument;
> > import com.xpn.xwiki.plugin.XWikiDefaultPlugin;
> > import com.xpn.xwiki.plugin.XWikiPluginInterface;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.EmptyLinkFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.ImageTagFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.PinLiFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.TagRemoveFilter;
> > -import
> > com.xpn.xwiki.plugin.officeimporter.filter.UnderlineLinkFilter;
> > -import
> > com.xpn.xwiki.plugin.officeimporter.filter.XWikiSyntaxEscapeFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
> > import com.xpn.xwiki.plugin.officeimporter.utils.DocumentType;
> > +import com.xpn.xwiki.plugin.officeimporter.utils.HtmlFilterUtils;
> > +import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
> > import com.xpn.xwiki.web.Utils;
> >
> > /**
> > @@ -471,9 +466,7 @@
> >                 HTMLCleaner.ROLE), e);
> >         }
> >         Document document = htmlCleaner.clean(new
> > StringReader(inputHTML));
> > -
> > -        new UnderlineLinkFilter().filter(document);
> > -
> > +        HtmlFilterUtils.filterUnderlinedLinks(document);
> >         XMLUtils.stripHTMLEnvelope(document);
> >         String cleanedHTML = XMLUtils.toString(document);
> >         return cleanedHTML;
> > @@ -499,14 +492,12 @@
> >                 HTMLCleaner.ROLE), e);
> >         }
> >         Document document = htmlCleaner.clean(new
> > StringReader(inputHTML));
> > -
> > -        new TagRemoveFilter().filter(document);
> > -        new UnderlineLinkFilter().filter(document);
> > -        new XWikiSyntaxEscapeFilter().filter(document);
> > -        new ImageTagFilter().filter(document);
> > -        new PinLiFilter().filter(document);
> > -        new EmptyLinkFilter().filter(document);
> > -
> > +        HtmlFilterUtils.filterTags(document, new String[]{"style",
> > "script"});
> > +        HtmlFilterUtils.filterUnderlinedLinks(document);
> > +        HtmlFilterUtils.filterSytaxChars(document);
> > +        HtmlFilterUtils.filterImageLinks(document);
> > +        HtmlFilterUtils.filterParagraphTagsInLineItemTags(document);
> > +        HtmlFilterUtils.filterEmptyLinks(document);
> >         XMLUtils.stripHTMLEnvelope(document);
> >         String cleanedHTML = XMLUtils.toString(document);
> >         return cleanedHTML;
> >
> > Modified: sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/
> > xwiki/plugin/officeimporter/utils/HtmlFilterUtils.java
> > ===================================================================
> > --- sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/utils/HtmlFilterUtils.java      2008-10-28 11:33:41
> > UTC (rev 13867)
> > +++ sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/
> > plugin/officeimporter/utils/HtmlFilterUtils.java      2008-10-28 13:54:04
> > UTC (rev 13868)
> > @@ -1,12 +1,247 @@
> > package com.xpn.xwiki.plugin.officeimporter.utils;
> >
> > +import java.util.ArrayList;
> > +import java.util.List;
> > +
> > +import org.w3c.dom.Document;
> > +import org.w3c.dom.Element;
> > +import org.w3c.dom.NamedNodeMap;
> > +import org.w3c.dom.Node;
> > +import org.w3c.dom.NodeList;
> > +import org.w3c.dom.Text;
> > +
> > /**
> >  * A utility class containing a suite of filter methods used to
> > manipulate Html documents.
> >  *
> >  * @version $Id$
> >  * @since 1.7M1
> >  */
> > -public class HtmlFilterUtils
> > +public abstract class HtmlFilterUtils
> > {
> > +    /**
> > +     * Characters that need to be escaped when jumping from html to
> > xwiki syntax.
> > +     */
> > +    private static final List<String> escapeChars = new
> > ArrayList<String>();
> >
> > +    /**
> > +     * Static initializer for escape chars.
> > +     */
> > +    static {
> > +        escapeChars.add("[");
> > +        escapeChars.add("]");
> > +        escapeChars.add("{");
> > +        escapeChars.add("}");
> > +        escapeChars.add("*");
> > +        escapeChars.add("~");
> > +        escapeChars.add("_");
> > +        escapeChars.add("-");
> > +        escapeChars.add("1");
> > +        escapeChars.add("#");
> > +        escapeChars.add("$");
> > +    }
> > +
> > +    /**
> > +     * Removes empty links from html documents. If the label of the
> > link is empty, simply remove the
> > +     * tag as in {@code <a/>} or {@code <a 
> > href=""/>}. If the label
> > is not null but the href
> > +     * attribute is missing, replace the tag with its label. Like
> > changing {@code <a>something</a>}
> > +     * to {@code something}.
> > +     *
> > +     * @param document The html document.
> > +     */
> > +    public static void filterEmptyLinks(Document document)
> > +    {
> > +        Element root = document.getDocumentElement();
> > +        NodeList links = root.getElementsByTagName("a");
> > +        for (int i = 0; i < links.getLength(); i++) {
> > +            Node link = links.item(i);
> > +            if (link.getTextContent() == null ||
> > link.getTextContent().trim().equals("")) {
> > +                link.getParentNode().removeChild(link);
> > +                i--;
> > +                continue;
> > +            }
> > +
> > +            Node hrefAttr =
> > link.getAttributes().getNamedItem("href");
> > +            if (hrefAttr == null ||
> > hrefAttr.getTextContent().trim().equals("")) {
> > +                NodeList children = link.getChildNodes();
> > +                while (children.getLength() > 0) {
> > +
> > link.getParentNode().insertBefore(children.item(0), link);
> > +                }
> > +                link.getParentNode().removeChild(link);
> > +                i--;
> > +            }
> > +        }
> > +    }
> > +
> > +    /**
> > +     * Replaces the {@code <img>} tags with corresponding 
> > {image}
> > macro elements which are
> > +     * recognized by xwiki syntax 1.0. Handles image attributes
> > like src, width, height, alt, align.
> > +     *
> > +     * @param document The html document.
> > +     */
> > +    public static void filterImageLinks(Document document)
> > +    {
> > +        Element root = document.getDocumentElement();
> > +        NodeList imgs = root.getElementsByTagName("img");
> > +        while (imgs.getLength() > 0) {
> > +            Node image = imgs.item(0);
> > +            String imageCode = generateImageMacroString(image);
> > +            Node parent = image.getParentNode();
> > +            Text newImg = document.createTextNode(imageCode);
> > +            parent.replaceChild(newImg, image);
> > +        }
> > +    }
> > +
> > +    /**
> > +     * Converts a {@code <img>} element into a xwiki syntax 1.0
> > {image} macro element.
> > +     *
> > +     * @param imageLink Node representing the image link.
> > +     * @return Converted {image} macro string.
> > +     */
> > +    private static String generateImageMacroString(Node imageLink)
> > +    {
> > +        NamedNodeMap attrs = imageLink.getAttributes();
> > +        if (attrs == null) {
> > +            return null;
> > +        }
> > +        StringBuffer sb = new StringBuffer();
> > +        sb.append("{image:");
> > +        if (attrs.getNamedItem("src") != null) {
> > +            String src = attrs.getNamedItem("src").getTextContent();
> > +            sb.append(src);
> > +        }
> > +        if (attrs.getNamedItem("width") != null) {
> > +            String width =
> > attrs.getNamedItem("width").getTextContent();
> > +            sb.append("|width=" + width);
> > +        }
> > +        if (attrs.getNamedItem("height") != null) {
> > +            String height =
> > attrs.getNamedItem("height").getTextContent();
> > +            sb.append("|height=" + height);
> > +        }
> > +        if (attrs.getNamedItem("alt") != null) {
> > +            String alt = attrs.getNamedItem("alt").getTextContent();
> > +            sb.append("|alt=" + alt);
> > +        }
> > +        if (attrs.getNamedItem("align") != null) {
> > +            String align =
> > attrs.getNamedItem("align").getTextContent();
> > +            sb.append("|align=" + align);
> > +        }
> > +        sb.append("}");
> > +        return sb.toString();
> > +    }
> > +
> > +    /**
> > +     * Removes the starting {@code <p>} tags found within 
> > {@code
> > <li>} tags. This is useful since
> > +     * such formations are not properly handled in xwiki 1.0 syntax.
> > +     *
> > +     * @param document The html document.
> > +     */
> > +    public static void filterParagraphTagsInLineItemTags(Document
> > document)
> > +    {
> > +        Element root = document.getDocumentElement();
> > +        NodeList lists = root.getElementsByTagName("li");
> > +        for (int i = 0; i < lists.getLength(); i++) {
> > +            Node list = lists.item(i);
> > +            Node firstChild = list.getFirstChild();
> > +            if (firstChild.getNodeName() != null &&
> > firstChild.getNodeName().equals("p")) {
> > +                NodeList childchildren = firstChild.getChildNodes();
> > +                while (childchildren.getLength() > 0) {
> > +                    list.insertBefore(childchildren.item(0),
> > firstChild);
> > +                }
> > +                list.removeChild(firstChild);
> > +            }
> > +        }
> > +    }
> > +
> > +    /**
> > +     * Removes all listed tags from the given html document.
> > +     *
> > +     * @param document The html document.
> > +     * @param tags Tags to be removed.
> > +     */
> > +    public static void filterTags(Document document, String[] tags)
> > +    {
> > +        Element root = document.getDocumentElement();
> > +        for (String tag : tags) {
> > +            NodeList toBeRemovedTags =
> > root.getElementsByTagName(tag);
> > +            while (toBeRemovedTags.getLength() > 0) {
> > +                Node t = toBeRemovedTags.item(0);
> > +                t.getParentNode().removeChild(t);
> > +            }
> > +        }
> > +    }
> > +
> > +    /**
> > +     * Strips off underline tags surrounding links like {@code
> > <u><a href="something">link</a></u>}.
> > +     *
> > +     * @param document The html document.
> > +     */
> > +    public static void filterUnderlinedLinks(Document document)
> > +    {
> > +        Element root = document.getDocumentElement();
> > +        NodeList links = root.getElementsByTagName("a");
> > +        for (int i = 0; i < links.getLength(); i++) {
> > +            Node link = links.item(i);
> > +            Node parent = link.getParentNode();
> > +            String parentName = parent.getNodeName();
> > +            if (parentName != null && (parentName.equals("u") ||
> > parentName.equals("del"))) {
> > +                parent.getParentNode().insertBefore(link, parent);
> > +                parent.getParentNode().removeChild(parent);
> > +            }
> > +        }
> > +    }
> > +
> > +    /**
> > +     * Escapes the xwiki syntax characters from the given html
> > document. Example : {@code [} will be
> > +     * replaced by {@code \[}.
> > +     *
> > +     * @param document The html document.
> > +     */
> > +    public static void filterSytaxChars(Document document)
> > +    {
> > +        Element root = document.getDocumentElement();
> > +        escapeNode(root);
> > +    }
> > +
> > +    /**
> > +     * Escapes xwiki syntax characters within the given node's
> > content.
> > +     *
> > +     * @param node The node which is to be examined.
> > +     */
> > +    private static void escapeNode(Node node)
> > +    {
> > +        NodeList nodes = node.getChildNodes();
> > +        for (int i = 0; i < nodes.getLength(); i++) {
> > +            Node next = nodes.item(i);
> > +            if (next instanceof Text) {
> > +                String text = next.getTextContent();
> > +                text = escapeText(text);
> > +                next.setTextContent(text);
> > +            } else {
> > +                if (next.hasChildNodes()) {
> > +                    escapeNode(next);
> > +                }
> > +            }
> > +        }
> > +    }
> > +
> > +    /**
> > +     * Escapes xwiki syntax characters within the given string.
> > +     *
> > +     * @param text The string to be examined.
> > +     * @return The syntax escaped string.
> > +     */
> > +    private static String escapeText(String text)
> > +    {
> > +        StringBuffer sb = new StringBuffer();
> > +        for (int i = 0; i < text.length(); i++) {
> > +            char x = text.charAt(i);
> > +            if (escapeChars.contains(String.valueOf(x))) {
> > +                sb.append("\\");
> > +                sb.append(String.valueOf(x));
> > +            } else {
> > +                sb.append(x);
> > +            }
> > +        }
> > +        return sb.toString();
> > +    }
> > }
> >
> > Modified: sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/
> > xwiki/plugin/officeconverter/CleanHTMLTest.java
> > ===================================================================
> > --- sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/
> > plugin/officeconverter/CleanHTMLTest.java     2008-10-28 11:33:41 UTC
> > (rev 13867)
> > +++ sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/
> > plugin/officeconverter/CleanHTMLTest.java     2008-10-28 13:54:04 UTC
> > (rev 13868)
> > @@ -27,13 +27,7 @@
> > import org.xwiki.xml.XMLUtils;
> > import org.xwiki.xml.html.HTMLCleaner;
> >
> > -import com.xpn.xwiki.plugin.officeimporter.filter.EmptyLinkFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.HTMLFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.ImageTagFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.PinLiFilter;
> > -import com.xpn.xwiki.plugin.officeimporter.filter.TagRemoveFilter;
> > -import
> > com.xpn.xwiki.plugin.officeimporter.filter.UnderlineLinkFilter;
> > -import
> > com.xpn.xwiki.plugin.officeimporter.filter.XWikiSyntaxEscapeFilter;
> > +import com.xpn.xwiki.plugin.officeimporter.utils.HtmlFilterUtils;
> > import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
> > import com.xpn.xwiki.test.AbstractXWikiComponentTestCase;
> >
> > @@ -121,14 +115,12 @@
> >     private void test(String input, String expected) throws
> > ImporterException
> >     {
> >         Document document = cleaner.clean(new StringReader(input));
> > -
> > -        new TagRemoveFilter().filter(document);
> > -        new UnderlineLinkFilter().filter(document);
> > -        new XWikiSyntaxEscapeFilter().filter(document);
> > -        new ImageTagFilter().filter(document);
> > -        new PinLiFilter().filter(document);
> > -        new EmptyLinkFilter().filter(document);
> > -
> > +        HtmlFilterUtils.filterTags(document, new String[]{"style",
> > "script"});
> > +        HtmlFilterUtils.filterUnderlinedLinks(document);
> > +        HtmlFilterUtils.filterSytaxChars(document);
> > +        HtmlFilterUtils.filterImageLinks(document);
> > +        HtmlFilterUtils.filterParagraphTagsInLineItemTags(document);
> > +        HtmlFilterUtils.filterEmptyLinks(document);
> >         XMLUtils.stripHTMLEnvelope(document);
> >         String actual = XMLUtils.toString(document);
> >         assertEquals(HEAD + expected + FOOT, actual);
> >
> > _______________________________________________
> > notifications mailing list
> > [EMAIL PROTECTED]
> > http://lists.xwiki.org/mailman/listinfo/notifications
>
> _______________________________________________
> devs mailing list
> [email protected]
> http://lists.xwiki.org/mailman/listinfo/devs
>
_______________________________________________
devs mailing list
[email protected]
http://lists.xwiki.org/mailman/listinfo/devs

Re: [xwiki-devs] [xwiki-notifications] r13868 - in sandbox/xwiki-plugin-officeimporter/src: main/java/com/xpn/xwiki/plugin/officeimporter main/java/com/xpn/xwiki/plugin/officeimporter/utils test/java/com/xpn/xwiki/plugin/officeconverter

Reply via email to