knguyen 2004/10/13 14:16:19 CEST
Modified files: (Branch: JAHIA-4-0-BRANCH)
src/java/org/jahia/services/htmlparser TidyConfig.java
TidyHtmlParser.java
Added files: (Branch: JAHIA-4-0-BRANCH)
src/java/org/jahia/services/htmlparser TagRemover.java
Log:
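- Fix Tidy charset issue: detect the input charset (CharsetDetection) and use it for Tidy parsing and output serialization; replace RemoveUnrecognizedMarkupVisitor with the new TagRemover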
Revision Changes Path
1.1.2.1 +125 -0 jahia/src/java/org/jahia/services/htmlparser/TagRemover.java
(new)
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TagRemover.java?rev=1.1.2.1&content-type=text/plain
1.1.4.2 +1 -0 jahia/src/java/org/jahia/services/htmlparser/TidyConfig.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TidyConfig.java.diff?r1=1.1.4.1&r2=1.1.4.2&f=h
1.8.2.4 +48 -11 jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java.diff?r1=1.8.2.3&r2=1.8.2.4&f=h
Index: TidyConfig.java
===================================================================
RCS file:
/home/cvs/repository/jahia/src/java/org/jahia/services/htmlparser/Attic/TidyConfig.java,v
retrieving revision 1.1.4.1
retrieving revision 1.1.4.2
diff -u -r1.1.4.1 -r1.1.4.2
--- TidyConfig.java 8 Oct 2004 16:38:18 -0000 1.1.4.1
+++ TidyConfig.java 13 Oct 2004 12:16:19 -0000 1.1.4.2
@@ -124,6 +124,7 @@
public class TidyConfig {
+ public static final String CHAR_ENCODING = "char-encoding";
public static final String NEW_INLINE_TAGS = "new-inline-tags";
public static final String NEW_EMTY_TAGS = "new-empty-tags";
public static final String NEW_BLOCK_LEVEL_TAGS = "new-blocklevel-tags";
Index: TidyHtmlParser.java
===================================================================
RCS file:
/home/cvs/repository/jahia/src/java/org/jahia/services/htmlparser/Attic/TidyHtmlParser.java,v
retrieving revision 1.8.2.3
retrieving revision 1.8.2.4
diff -u -r1.8.2.3 -r1.8.2.4
--- TidyHtmlParser.java 8 Oct 2004 16:38:11 -0000 1.8.2.3
+++ TidyHtmlParser.java 13 Oct 2004 12:16:19 -0000 1.8.2.4
@@ -35,6 +35,7 @@
import org.jahia.utils.JahiaTools;
import org.jahia.utils.TextHtml;
import java.util.ArrayList;
+import org.jahia.utils.fileparsers.CharsetDetection;
/**
*
@@ -110,6 +111,7 @@
if ( input == null || "".equals(input.trim())){
return input;
}
+
String result = new String(input);
result = JahiaTools.replacePattern(result,"&",AMPERSAND);
@@ -160,13 +162,32 @@
val += "," + tag;
}
}
+
config.setProperty(TidyConfig.NEW_INLINE_TAGS, val);
+ // charset
+ byte[] strByte = null;
+ String charSet = null; // by default open as ascii
+ CharsetDetection charsetDet = new CharsetDetection();
+ try {
+ strByte = org.apache.commons.io.IOUtils.toByteArray(result);
+ strIn = new ByteArrayInputStream(strByte);
+ charsetDet.charsetDetection(strIn);
+ charSet = charsetDet.getCharset();
+ if ( charSet != null && "UTF-8".equalsIgnoreCase(charSet) ){
+ config.setProperty(TidyConfig.CHAR_ENCODING,"utf8");
+ }
+ } catch ( Throwable t ){
+ }
+
tidy.setConfigurationFromProps(config);
try {
-
- byte strByte[] = result.getBytes();
+ if ( charSet == null ){
+ strByte = result.getBytes();
+ } else {
+ strByte = result.getBytes(charSet);
+ }
strIn = new ByteArrayInputStream(strByte);
strOut = new ByteArrayOutputStream();
ByteArrayOutputStream strErr = new ByteArrayOutputStream();
@@ -175,7 +196,12 @@
tidy.parse(strIn, strOut);
strIn.reset();
- String tmpValue = strOut.toString();
+ String tmpValue = null;
+ if ( charSet == null ){
+ tmpValue = strOut.toString();
+ } else {
+ tmpValue = strOut.toString(charSet);
+ }
tmpValue = JahiaTools.replacePattern(tmpValue, "&",
AMPERSAND_SECONDPASS);
@@ -183,7 +209,12 @@
tmpValue = "";
}
if (!"".equals(tmpValue.trim())) {
- strIn = new ByteArrayInputStream(tmpValue.getBytes());
+ if ( charSet == null ){
+ strByte = tmpValue.getBytes();
+ } else {
+ strByte = tmpValue.getBytes(charSet);
+ }
+ strIn = new ByteArrayInputStream(strByte);
DocumentBuilderFactory dfactory = DocumentBuilderFactory.
newInstance();
@@ -199,20 +230,19 @@
if (et != null) {
docBuilder.setEntityResolver(et);
}
-
Document doc = docBuilder.parse(strIn);
- RemoveUnrecognizedMarkupVisitor rumv =
- new RemoveUnrecognizedMarkupVisitor();
+ TagRemover tagRemover =
+ new TagRemover();
synchronized(unrecognizedTags){
size = unrecognizedTags.size();
for (int i = 0; i < size; i++) {
- rumv.addTag( (String) unrecognizedTags.get(i));
+ tagRemover.addTag( (String) unrecognizedTags.get(i));
}
}
-
- doc = rumv.parseDOM(doc);
+ tagRemover.addTag("o:p");
+ doc = tagRemover.parseDOM(doc);
size = DOMVisitors.size();
for (int i = 0; i < size; i++) {
@@ -230,6 +260,9 @@
serializer.setOutputProperty(OutputKeys.METHOD, "xml");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
+ if ( charSet != null ){
+ serializer.setOutputProperty(OutputKeys.ENCODING, charSet);
+ }
//serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
//serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT,
"4");
strOut.reset();
@@ -241,7 +274,11 @@
"</TIDYERRORS>";
}
else {
- result = strOut.toString();
+ if ( charSet == null ){
+ result = strOut.toString();
+ } else {
+ result = strOut.toString(charSet);
+ }
}
result = JahiaTools.replacePattern(result, AMPERSAND_SECONDPASS,
"&");