knguyen     2004/10/08 18:38:12 CEST

  Modified files:        (Branch: JAHIA-4-0-BRANCH)
    src/java/org/jahia/services/htmlparser TidyHtmlParser.java 
  Log:
  By default, make tidy remove all unknown tags, cleaning Word markup as well.
  If we do not want it to remove a certain type of unknown tag, we can always declare
it in the tidy config file.
  
  Revision  Changes    Path
  1.8.2.3   +185 -91   jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java.diff?r1=1.8.2.2&r2=1.8.2.3&f=h
  
  
  
  Index: TidyHtmlParser.java
  ===================================================================
  RCS file: 
/home/cvs/repository/jahia/src/java/org/jahia/services/htmlparser/Attic/TidyHtmlParser.java,v
  retrieving revision 1.8.2.2
  retrieving revision 1.8.2.3
  diff -u -r1.8.2.2 -r1.8.2.3
  --- TidyHtmlParser.java       5 Oct 2004 19:07:02 -0000       1.8.2.2
  +++ TidyHtmlParser.java       8 Oct 2004 16:38:11 -0000       1.8.2.3
  @@ -51,6 +51,10 @@
       public static String AMPERSAND_SECONDPASS = "$$$amp_secondpass$$$";
       public static String TIDYERRORS_TAG = "TIDYERRORS";
   
  +    private static Vector newInlineTags = new Vector();
  +    private static Vector newBlockLevelTags = new Vector();
  +    private static Vector unrecognizedTags = new Vector();
  +
       private Properties config = new Properties();
   
       public TidyHtmlParser(){}
  @@ -123,99 +127,189 @@
           newInlineTags = JahiaTextContentTidy.JAHIA_HTML_TAG_NAME + ", " + 
newInlineTags;
           config.setProperty(TidyConfig.NEW_INLINE_TAGS,newInlineTags);
           */
  -        tidy.setConfigurationFromProps(tidyConfig);
  -
  -        try {
  -
  -            byte strByte[] = result.getBytes();
  -            strIn = new ByteArrayInputStream(strByte);
  -            strOut = new ByteArrayOutputStream();
  -            ByteArrayOutputStream strErr = new ByteArrayOutputStream();
  -            tidy.setErrout(new PrintWriter( strErr, true));
  -            tidy.setShowWarnings( false );
  -            tidy.parse(strIn, strOut);
  -
  -            strIn.reset();
  -            String tmpValue = strOut.toString();
  -            tmpValue = JahiaTools.replacePattern(tmpValue,"&",AMPERSAND_SECONDPASS);
  -
  -            if ( tmpValue == null ){
  -                tmpValue = "";
  -            }
  -            if ( !"".equals(tmpValue.trim()) ){
  -                strIn = new ByteArrayInputStream(tmpValue.getBytes());
   
  -                DocumentBuilderFactory dfactory = 
DocumentBuilderFactory.newInstance();
  +       Properties config = (Properties) tidyConfig.clone();
  +       String val = tidyConfig.getProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS);
  +       if (val == null) {
  +           val = "";
  +       }
  +       String tag = null;
  +       int size = newBlockLevelTags.size();
  +       for (int i = 0; i < size; i++) {
  +           tag = (String) newBlockLevelTags.get(i);
  +           if (val.length() == 0) {
  +               val = tag;
  +           }
  +           else {
  +               val += "," + tag;
  +           }
  +       }
  +       config.setProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS, val);
  +
  +       val = config.getProperty(TidyConfig.NEW_INLINE_TAGS);
  +       if (val == null) {
  +           val = "";
  +       }
  +       size = newInlineTags.size();
  +       for (int i = 0; i < size; i++) {
  +           tag = (String) newInlineTags.get(i);
  +           if (val.length() == 0) {
  +               val = tag;
  +           }
  +           else {
  +               val += "," + tag;
  +           }
  +       }
  +       config.setProperty(TidyConfig.NEW_INLINE_TAGS, val);
  +
  +       tidy.setConfigurationFromProps(config);
  +
  +       try {
  +
  +           byte strByte[] = result.getBytes();
  +           strIn = new ByteArrayInputStream(strByte);
  +           strOut = new ByteArrayOutputStream();
  +           ByteArrayOutputStream strErr = new ByteArrayOutputStream();
  +           tidy.setErrout(new PrintWriter(strErr, true));
  +           tidy.setShowWarnings(false);
  +           tidy.parse(strIn, strOut);
  +
  +           strIn.reset();
  +           String tmpValue = strOut.toString();
  +           tmpValue = JahiaTools.replacePattern(tmpValue, "&",
  +                                                AMPERSAND_SECONDPASS);
  +
  +           if (tmpValue == null) {
  +               tmpValue = "";
  +           }
  +           if (!"".equals(tmpValue.trim())) {
  +               strIn = new ByteArrayInputStream(tmpValue.getBytes());
  +
  +               DocumentBuilderFactory dfactory = DocumentBuilderFactory.
  +                   newInstance();
  +
  +               EntityResolver et = null;
  +               try {
  +                   et = ServicesRegistry.getInstance().
  +                       getJahiaWebAppsDeployerService().getDtdEntityResolver();
  +               }
  +               catch (Throwable t) {
  +               }
  +               DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
  +               if (et != null) {
  +                   docBuilder.setEntityResolver(et);
  +               }
  +
  +               Document doc = docBuilder.parse(strIn);
  +
  +               RemoveUnrecognizedMarkupVisitor rumv =
  +                   new RemoveUnrecognizedMarkupVisitor();
  +
  +               synchronized(unrecognizedTags){
  +                   size = unrecognizedTags.size();
  +                   for (int i = 0; i < size; i++) {
  +                       rumv.addTag( (String) unrecognizedTags.get(i));
  +                   }
  +               }
  +
  +               doc = rumv.parseDOM(doc);
  +
  +               size = DOMVisitors.size();
  +               for (int i = 0; i < size; i++) {
  +                   HtmlDOMVisitor visitor = (HtmlDOMVisitor) DOMVisitors.get(i);
  +                   doc = visitor.parseDOM(doc);
  +               }
  +
  +               doc.normalize();
  +
  +               TransformerFactory tfactory = TransformerFactory.newInstance();
  +
  +               // This creates a transformer that does a simple identity transform,
  +               // and thus can be used for all intents and purposes as a serializer.
  +               Transformer serializer = tfactory.newTransformer();
  +
  +               serializer.setOutputProperty(OutputKeys.METHOD, "xml");
  +               serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  +               
//serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount";, "4");
  +               //serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, 
"4");
  +               strOut.reset();
  +               serializer.transform(new DOMSource(doc),
  +                                    new StreamResult(strOut));
  +
  +               if (tidy.getParseErrors() > 0) {
  +                   result = "<TIDYERRORS>\n" + strErr.toString() +
  +                       "</TIDYERRORS>";
  +               }
  +               else {
  +                   result = strOut.toString();
  +               }
  +               result = JahiaTools.replacePattern(result, AMPERSAND_SECONDPASS,
  +                   "&");
  +               result = JahiaTools.text2XMLEntityRef(result, 1);
  +               result = JahiaTools.replacePattern(result, AMPERSAND, "&");
  +
  +           }
  +           else if (tidy.getParseErrors() > 0) {
  +               String err = strErr.toString();
  +               result = "<TIDYERRORS>\n" + err + "</TIDYERRORS>";
  +               if (err.indexOf("is not recognized!") != -1) {
  +                   err = JahiaTools.replacePatternIgnoreCase(err.toLowerCase(),
  +                       " - error: ", "@@@");
  +                   String[] errors = org.jahia.utils.JahiaTools.getTokens(
  +                       err, "@@@");
  +                   if (errors.length > 0) {
  +                       String token = "";
  +                       ArrayList tags = new ArrayList();
  +                       tag = null;
  +                       String newInput = input;
  +                       int pos = -1;
  +                       for (int i = 0; i < errors.length; i++) {
  +                           token = errors[i];
  +                           pos = token.indexOf(" is not recognized!");
  +                           if (pos != -1) {
  +                               try {
  +                                   tag = token.substring(0,pos);
  +                                   if ( !tag.startsWith("<") ){
  +                                       // we found an unknown empty tag
  +                                       synchronized(unrecognizedTags){
  +                                           if (unrecognizedTags.contains(tag)) {
  +                                               continue;
  +                                           }
  +                                           else {
  +                                               unrecognizedTags.add(tag);
  +                                               newInlineTags.add(tag);
  +                                           }
  +                                       }
  +                                   } else {
  +                                       tag = tag.substring(1, tag.length() - 1);
  +                                       synchronized(unrecognizedTags){
  +                                           if (unrecognizedTags.contains(tag)) {
  +                                               continue;
  +                                           }
  +                                           else {
  +                                               unrecognizedTags.add(tag);
  +                                               newBlockLevelTags.add(tag);
  +                                           }
  +                                       }
  +                                   }
  +                               }
  +                               catch (Throwable t) {
  +                               }
  +                           }
  +                       }
  +                       result = parse(input, siteId, tidyConfig,
  +                                      DOMVisitors);
  +                   }
  +               }
  +           }
  +       }
  +       catch (Exception e) {
  +           e.printStackTrace();
  +           return input;
  +       }
   
  -                EntityResolver et = null;
  -                try {
  -                    et = 
ServicesRegistry.getInstance().getJahiaWebAppsDeployerService().getDtdEntityResolver();
  -                } catch ( Throwable t ){
  -                }
  -                DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
  -                if ( et != null ){
  -                    docBuilder.setEntityResolver(et);
  -                }
  -
  -                Document doc = docBuilder.parse(strIn);
  -
  -                int size = DOMVisitors.size();
  -                for ( int i=0 ; i<size; i++ ){
  -                    HtmlDOMVisitor visitor = (HtmlDOMVisitor)DOMVisitors.get(i);
  -                    doc = visitor.parseDOM(doc);
  -                }
  -
  -                doc.normalize();
  -
  -                TransformerFactory tfactory = TransformerFactory.newInstance();
  -
  -                // This creates a transformer that does a simple identity transform,
  -                // and thus can be used for all intents and purposes as a 
serializer.
  -                Transformer serializer = tfactory.newTransformer();
  -
  -                serializer.setOutputProperty(OutputKeys.METHOD, "xml");
  -                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  -                
//serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount";, "4");
  -                
//serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "4");
  -                strOut.reset();
  -                serializer.transform (new DOMSource(doc),
  -                                      new  StreamResult(strOut));
  -
  -                if ( tidy.getParseErrors() > 0 ) {
  -                    result = "<TIDYERRORS>\n" + strErr.toString() + "</TIDYERRORS>";
  -                } else {
  -                    result = strOut.toString();
  -                }
  -                result = JahiaTools.replacePattern(result,AMPERSAND_SECONDPASS,"&");
  -                result = JahiaTools.text2XMLEntityRef(result,1);
  -                result = JahiaTools.replacePattern(result,AMPERSAND,"&");
  -
  -            } else if ( tidy.getParseErrors() > 0 ){
  -                result = "<TIDYERRORS>\n" + strErr.toString() + "</TIDYERRORS>";
  -                result = JahiaTools.replacePatternIgnoreCase(result," is not 
recognized!","@@@");
  -                String[] errors = 
org.jahia.utils.JahiaTools.getTokens(result,"@@@");
  -                if ( errors.length>0 ){
  -                    String token = "";
  -                    ArrayList tags = new ArrayList();
  -                    String tag = null;
  -                    String newInput = input;
  -                    for (int i = 0; i < errors.length; i++) {
  -                        token = errors[i];
  -                        tag = token.substring(token.lastIndexOf("<"),
  -                                              token.lastIndexOf(">"));
  -                        newInput = 
JahiaTools.replacePatternIgnoreCase(newInput,tag,"$$$notrecognizedtag$$$"+tag.substring(1));
  -                    }
  -                    result = parse( newInput, siteId, tidyConfig, DOMVisitors);
  -                    result = JahiaTools.replacePatternIgnoreCase 
(result,"$$$notrecognizedtag$$$","<");
  -                }
  -            }
  -        } catch (Exception e) {
  -            e.printStackTrace();
  -            return input;
  -        }
  -
  -        return result.trim();
  -    }
  +       return result.trim();
  +   }
   
       /**
        * Clone a Tidy DOM Tree and return a JAXP 2 DOM Tree
  

Reply via email to