knguyen     2004/10/13 14:16:19 CEST

  Modified files:        (Branch: JAHIA-4-0-BRANCH)
    src/java/org/jahia/services/htmlparser TidyConfig.java 
                                           TidyHtmlParser.java 
  Added files:           (Branch: JAHIA-4-0-BRANCH)
    src/java/org/jahia/services/htmlparser TagRemover.java 
  Log:
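  - tidy and charset issue: detect the input charset (CharsetDetection) and
    use it consistently for String<->byte conversions, for Tidy's
    "char-encoding" setting (set to "utf8" when UTF-8 is detected) and for the
    XML serializer's output encoding
  - replace RemoveUnrecognizedMarkupVisitor with the new TagRemover, which
    also strips "o:p" tags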
  
  Revision  Changes    Path
  1.1.2.1   +125 -0    jahia/src/java/org/jahia/services/htmlparser/TagRemover.java (new)
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TagRemover.java?rev=1.1.2.1&content-type=text/plain
  1.1.4.2   +1 -0      jahia/src/java/org/jahia/services/htmlparser/TidyConfig.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TidyConfig.java.diff?r1=1.1.4.1&r2=1.1.4.2&f=h
  1.8.2.4   +48 -11    jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java.diff?r1=1.8.2.3&r2=1.8.2.4&f=h
  
  
  
  
  
  
  Index: TidyConfig.java
  ===================================================================
  RCS file: /home/cvs/repository/jahia/src/java/org/jahia/services/htmlparser/Attic/TidyConfig.java,v
  retrieving revision 1.1.4.1
  retrieving revision 1.1.4.2
  diff -u -r1.1.4.1 -r1.1.4.2
  --- TidyConfig.java   8 Oct 2004 16:38:18 -0000       1.1.4.1
  +++ TidyConfig.java   13 Oct 2004 12:16:19 -0000      1.1.4.2
  @@ -124,6 +124,7 @@
   
   public class TidyConfig {
   
  +    public static final String CHAR_ENCODING = "char-encoding";
       public static final String NEW_INLINE_TAGS = "new-inline-tags";
       public static final String NEW_EMTY_TAGS = "new-empty-tags";
       public static final String NEW_BLOCK_LEVEL_TAGS = "new-blocklevel-tags";
  
  
  
  Index: TidyHtmlParser.java
  ===================================================================
  RCS file: /home/cvs/repository/jahia/src/java/org/jahia/services/htmlparser/Attic/TidyHtmlParser.java,v
  retrieving revision 1.8.2.3
  retrieving revision 1.8.2.4
  diff -u -r1.8.2.3 -r1.8.2.4
  --- TidyHtmlParser.java       8 Oct 2004 16:38:11 -0000       1.8.2.3
  +++ TidyHtmlParser.java       13 Oct 2004 12:16:19 -0000      1.8.2.4
  @@ -35,6 +35,7 @@
   import org.jahia.utils.JahiaTools;
   import org.jahia.utils.TextHtml;
   import java.util.ArrayList;
  +import org.jahia.utils.fileparsers.CharsetDetection;
   
   /**
    *
  @@ -110,6 +111,7 @@
           if ( input == null || "".equals(input.trim())){
               return input;
           }
  +
           String result = new String(input);
           result = JahiaTools.replacePattern(result,"&",AMPERSAND);
   
  @@ -160,13 +162,32 @@
                  val += "," + tag;
              }
          }
  +
          config.setProperty(TidyConfig.NEW_INLINE_TAGS, val);
   
  +       // charset
  +       byte[] strByte = null;
  +       String charSet = null; // by default open as ascii
  +       CharsetDetection charsetDet = new CharsetDetection();
  +       try {
  +           strByte = org.apache.commons.io.IOUtils.toByteArray(result);
  +           strIn = new ByteArrayInputStream(strByte);
  +           charsetDet.charsetDetection(strIn);
  +           charSet = charsetDet.getCharset();
  +           if ( charSet != null && "UTF-8".equalsIgnoreCase(charSet) ){
  +               config.setProperty(TidyConfig.CHAR_ENCODING,"utf8");
  +           }
  +       } catch ( Throwable t ){
  +       }
  +
          tidy.setConfigurationFromProps(config);
   
          try {
  -
  -           byte strByte[] = result.getBytes();
  +           if ( charSet == null ){
  +               strByte = result.getBytes();
  +           } else {
  +               strByte = result.getBytes(charSet);
  +           }
              strIn = new ByteArrayInputStream(strByte);
              strOut = new ByteArrayOutputStream();
              ByteArrayOutputStream strErr = new ByteArrayOutputStream();
  @@ -175,7 +196,12 @@
              tidy.parse(strIn, strOut);
   
              strIn.reset();
  -           String tmpValue = strOut.toString();
  +           String tmpValue = null;
  +           if ( charSet == null ){
  +               tmpValue = strOut.toString();
  +           } else {
  +               tmpValue = strOut.toString(charSet);
  +           }
              tmpValue = JahiaTools.replacePattern(tmpValue, "&",
                                                   AMPERSAND_SECONDPASS);
   
  @@ -183,7 +209,12 @@
                  tmpValue = "";
              }
              if (!"".equals(tmpValue.trim())) {
  -               strIn = new ByteArrayInputStream(tmpValue.getBytes());
  +               if ( charSet == null ){
  +                   strByte = tmpValue.getBytes();
  +               } else {
  +                   strByte = tmpValue.getBytes(charSet);
  +               }
  +               strIn = new ByteArrayInputStream(strByte);
   
                  DocumentBuilderFactory dfactory = DocumentBuilderFactory.
                      newInstance();
  @@ -199,20 +230,19 @@
                  if (et != null) {
                      docBuilder.setEntityResolver(et);
                  }
  -
                  Document doc = docBuilder.parse(strIn);
   
  -               RemoveUnrecognizedMarkupVisitor rumv =
  -                   new RemoveUnrecognizedMarkupVisitor();
  +               TagRemover tagRemover =
  +                   new TagRemover();
   
                  synchronized(unrecognizedTags){
                      size = unrecognizedTags.size();
                      for (int i = 0; i < size; i++) {
  -                       rumv.addTag( (String) unrecognizedTags.get(i));
  +                       tagRemover.addTag( (String) unrecognizedTags.get(i));
                      }
                  }
  -
  -               doc = rumv.parseDOM(doc);
  +               tagRemover.addTag("o:p");
  +               doc = tagRemover.parseDOM(doc);
   
                  size = DOMVisitors.size();
                  for (int i = 0; i < size; i++) {
  @@ -230,6 +260,9 @@
   
                  serializer.setOutputProperty(OutputKeys.METHOD, "xml");
                  serializer.setOutputProperty(OutputKeys.INDENT, "yes");
  +               if ( charSet != null ){
  +                   serializer.setOutputProperty(OutputKeys.ENCODING, charSet);
  +               }
                  
                  //serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
                  //serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "4");
                  strOut.reset();
  @@ -241,7 +274,11 @@
                          "</TIDYERRORS>";
                  }
                  else {
  -                   result = strOut.toString();
  +                   if ( charSet == null ){
  +                       result = strOut.toString();
  +                   } else {
  +                       result = strOut.toString(charSet);
  +                   }
                  }
                  result = JahiaTools.replacePattern(result, AMPERSAND_SECONDPASS,
                      "&");
  

Reply via email to