tdraier     2005/11/09 12:00:37 CET

  Modified files:
    core/src/java/org/jahia/services/htmlparser 
                                                HtmlParserBaseService.java 
                                                NekoHtmlParser.java 
    core/src/webapp/WEB-INF/etc/spring 
                                       applicationcontext-basejahiaconfig.xml 
  Log:
  fixed html parser, cleaned neko parser
  
  Revision  Changes    Path
  1.11      +4 -4      
jahia/core/src/java/org/jahia/services/htmlparser/HtmlParserBaseService.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/htmlparser/HtmlParserBaseService.java.diff?r1=1.10&r2=1.11&f=h
  1.4       +31 -55    
jahia/core/src/java/org/jahia/services/htmlparser/NekoHtmlParser.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/htmlparser/NekoHtmlParser.java.diff?r1=1.3&r2=1.4&f=h
  1.7       +2 -2      
jahia/core/src/webapp/WEB-INF/etc/spring/applicationcontext-basejahiaconfig.xml
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/webapp/WEB-INF/etc/spring/applicationcontext-basejahiaconfig.xml.diff?r1=1.6&r2=1.7&f=h
  
  
  
  Index: HtmlParserBaseService.java
  ===================================================================
  RCS file: 
/home/cvs/repository/jahia/core/src/java/org/jahia/services/htmlparser/HtmlParserBaseService.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- HtmlParserBaseService.java        4 Oct 2005 16:02:38 -0000       1.10
  +++ HtmlParserBaseService.java        9 Nov 2005 11:00:36 -0000       1.11
  @@ -88,7 +88,7 @@
               markupDefinitions = 
markupDefinitionManager.getAllMarkupDefinitions();
   
           } catch ( Throwable t ){
  -            logger.debug("Error loading Markup Settings",t);
  +            logger.error("Error loading Markup Settings",t);
               markupDefinitions = new Hashtable();
               markupSettings = new Hashtable();
           }
  @@ -369,11 +369,11 @@
                   }
                   return parser;
               } catch (ClassNotFoundException cnfe) {
  -                logger.debug(cnfe);
  +                logger.error(cnfe);
               } catch (InstantiationException ie) {
  -                logger.debug(ie);
  +                logger.error(ie);
               } catch (IllegalAccessException iae) {
  -                logger.debug(iae);
  +                logger.error(iae);
               }
           }
           return null;
  
  
  
  Index: NekoHtmlParser.java
  ===================================================================
  RCS file: 
/home/cvs/repository/jahia/core/src/java/org/jahia/services/htmlparser/NekoHtmlParser.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- NekoHtmlParser.java       2 Aug 2005 13:31:57 -0000       1.3
  +++ NekoHtmlParser.java       9 Nov 2005 11:00:36 -0000       1.4
  @@ -2,6 +2,8 @@
   
   import java.io.ByteArrayInputStream;
   import java.io.ByteArrayOutputStream;
  +import java.io.StringReader;
  +import java.io.StringWriter;
   import java.util.Vector;
   import javax.xml.transform.OutputKeys;
   import javax.xml.transform.Transformer;
  @@ -13,6 +15,7 @@
   import org.jahia.utils.JahiaTools;
   import org.jahia.utils.fileparsers.CharsetDetection;
   import org.w3c.dom.Document;
  +import org.xml.sax.InputSource;
   
   /**
    *
  @@ -24,31 +27,31 @@
    * @version 1.0
    */
   public class NekoHtmlParser implements HtmlParser {
  -    
  +
       public static final String AMPERSAND = "$$$amp$$$";
       private static final DOMParser domParser = new DOMParser();
  -    
  +
       private static final org.apache.log4j.Logger logger =
               org.apache.log4j.Logger.getLogger(NekoHtmlParser.class);
  -    
  -    private NekoHtmlParser() {
  +
  +    public NekoHtmlParser() {
           try {
               
domParser.setProperty("http://cyberneko.org/html/properties/names/elems", 
"match");
               
domParser.setProperty("http://cyberneko.org/html/properties/names/attrs", 
"no-change");
  -            
domParser.setProperty("http://cyberneko.org/html/properties/doctype/pubid", 
null);
  -            
domParser.setProperty("http://cyberneko.org/html/properties/doctype/sysid", 
null);
  +//            
domParser.setProperty("http://cyberneko.org/html/properties/doctype/pubid", 
null);
  +//            
domParser.setProperty("http://cyberneko.org/html/properties/doctype/sysid", 
null);
           } catch (Exception e) {
               logger.fatal(e.getMessage(), e);
           }
       }
  -    
  +
       /**
        *
        * @param htmlParserService HtmlParserService
        */
       public void init(final HtmlParserService htmlParserService){
       }
  -    
  +
       /**
        * Parses and generates a clean html document, remove unwanted markups,..
        * Using default settings
  @@ -60,7 +63,7 @@
       public String parse(final String inputString, final Vector DOMVisitors){
           return parse(inputString,-1,DOMVisitors);
       }
  -    
  +
       /**
        * Parses and generates a clean html document, remove unwanted markups,..
        * Using settings as defined for a given site
  @@ -71,13 +74,13 @@
        * @return
        */
       public String parse(final String inputString, final Vector DOMVisitors,
  -            final int siteId){
  +                        final int siteId){
           if ( inputString == null || inputString.trim().equals("") ){
               return inputString;
           }
           return parse(inputString,siteId,DOMVisitors);
       }
  -    
  +
       /**
        * Parses and generates a clean html document, remove unwanted markups,..
        * Using settings as defined for a given site
  @@ -88,74 +91,47 @@
        * @return
        */
       public static String parse( final String input,
  -            final int siteId,
  -            final Vector DOMVisitors){
  -        
  +                                final int siteId,
  +                                final Vector DOMVisitors){
  +
           if ( input == null || "".equals(input.trim())){
               return input;
           }
  -        
  +
           String result = new String(input);
           result = JahiaTools.replacePattern(result, "&", AMPERSAND);
  -        
  -        ByteArrayInputStream strIn;
  -        final ByteArrayOutputStream strOut = new ByteArrayOutputStream();
  -        byte[] strByte = null;
  -        String charSet = null; // by default open as ascii
  -        final CharsetDetection charsetDet = new CharsetDetection();
  -        try {
  -            strByte = org.apache.commons.io.IOUtils.toByteArray(result);
  -            strIn = new ByteArrayInputStream(strByte);
  -            charsetDet.charsetDetection(strIn);
  -            charSet = charsetDet.getCharset();
  -        } catch ( Throwable t ){
  -            logger.error("Error parsing the document", t);
  -        }
  -        
  +
           int size = 0;
           try {
  -            if ( charSet == null ){
  -                strByte = result.getBytes();
  -            } else {
  -                strByte = result.getBytes(charSet);
  -            }
  -            strIn = new ByteArrayInputStream(strByte);
  -            org.xml.sax.InputSource in = new org.xml.sax.InputSource(strIn);
  -            
domParser.setProperty("http://cyberneko.org/html/properties/default-encoding", 
charSet);
  +            InputSource in = new InputSource(new StringReader(input));
               domParser.parse(in);
               Document doc = domParser.getDocument();
  -            
  +
               size = DOMVisitors.size();
               for (int i = 0; i <size; i++) {
                   final HtmlDOMVisitor visitor = (HtmlDOMVisitor) 
DOMVisitors.get(i);
                   doc = visitor.parseDOM(doc);
               }
  -            
  +
               doc.normalize();
               final TransformerFactory tfactory = 
TransformerFactory.newInstance();
  -            
  +
               // This creates a transformer that does a simple identity 
transform,
               // and thus can be used for all intents and purposes as a 
serializer.
               final Transformer serializer = tfactory.newTransformer();
  -            
  +
               serializer.setOutputProperty(OutputKeys.METHOD, "html");
               serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  -            if ( charSet != null ){
  -                serializer.setOutputProperty(OutputKeys.ENCODING, charSet);
  -            }
  -            
//serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", 
"4");
  +
               
serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "2");
  -            serializer.transform(new DOMSource(doc),
  -                    new StreamResult(strOut));
  -            if ( charSet == null ){
  -                result = strOut.toString();
  -            } else {
  -                result = strOut.toString(charSet);
  -            }
  -            
  +            StringWriter writer = new StringWriter();
  +            serializer.transform(new DOMSource(doc), new 
StreamResult(writer));
  +
  +            result = writer.toString();
  +
               result = JahiaTools.text2XMLEntityRef(result, 1);
               result = JahiaTools.replacePattern(result, AMPERSAND, "&");
  -            
  +
           } catch ( Throwable t ){
               logger.error("Error parsing the document", t);
               return input;
  
  
  
  Index: applicationcontext-basejahiaconfig.xml
  ===================================================================
  RCS file: 
/home/cvs/repository/jahia/core/src/webapp/WEB-INF/etc/spring/applicationcontext-basejahiaconfig.xml,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- applicationcontext-basejahiaconfig.xml    24 Oct 2005 12:42:49 -0000      
1.6
  +++ applicationcontext-basejahiaconfig.xml    9 Nov 2005 11:00:36 -0000       
1.7
  @@ -21,10 +21,10 @@
        <bean id="jahiaConfigBean" class="org.jahia.bin.JahiaConfig">
                <property name="settings">
                        <props>
  +                <!--
                                <prop 
key="org.jahia.services.htmlparser.HtmlParser">org.jahia.services.htmlparser.NekoHtmlParser</prop>
  -                             <!--
  -                             <prop 
key="org.jahia.services.htmlparser.HtmlParser">org.jahia.services.htmlparser.TidyHtmlParser</prop>
                                -->
  +                             <prop 
key="org.jahia.services.htmlparser.HtmlParser">org.jahia.services.htmlparser.TidyHtmlParser</prop>
                        </props>
                </property>
        </bean>
  

Reply via email to