knguyen 2004/10/08 18:38:12 CEST
Modified files: (Branch: JAHIA-4-0-BRANCH)
src/java/org/jahia/services/htmlparser TidyHtmlParser.java
Log:
By default, make Tidy remove all unknown tags, cleaning Word markup as well.
If we want it to keep certain unknown tags, we can always declare
them in the Tidy config file.
Revision Changes Path
1.8.2.3 +185 -91 jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/services/htmlparser/TidyHtmlParser.java.diff?r1=1.8.2.2&r2=1.8.2.3&f=h
Index: TidyHtmlParser.java
===================================================================
RCS file: /home/cvs/repository/jahia/src/java/org/jahia/services/htmlparser/Attic/TidyHtmlParser.java,v
retrieving revision 1.8.2.2
retrieving revision 1.8.2.3
diff -u -r1.8.2.2 -r1.8.2.3
--- TidyHtmlParser.java 5 Oct 2004 19:07:02 -0000 1.8.2.2
+++ TidyHtmlParser.java 8 Oct 2004 16:38:11 -0000 1.8.2.3
@@ -51,6 +51,10 @@
public static String AMPERSAND_SECONDPASS = "$$$amp_secondpass$$$";
public static String TIDYERRORS_TAG = "TIDYERRORS";
+ private static Vector newInlineTags = new Vector();
+ private static Vector newBlockLevelTags = new Vector();
+ private static Vector unrecognizedTags = new Vector();
+
private Properties config = new Properties();
public TidyHtmlParser(){}
@@ -123,99 +127,189 @@
newInlineTags = JahiaTextContentTidy.JAHIA_HTML_TAG_NAME + ", " +
newInlineTags;
config.setProperty(TidyConfig.NEW_INLINE_TAGS,newInlineTags);
*/
- tidy.setConfigurationFromProps(tidyConfig);
-
- try {
-
- byte strByte[] = result.getBytes();
- strIn = new ByteArrayInputStream(strByte);
- strOut = new ByteArrayOutputStream();
- ByteArrayOutputStream strErr = new ByteArrayOutputStream();
- tidy.setErrout(new PrintWriter( strErr, true));
- tidy.setShowWarnings( false );
- tidy.parse(strIn, strOut);
-
- strIn.reset();
- String tmpValue = strOut.toString();
- tmpValue = JahiaTools.replacePattern(tmpValue,"&",AMPERSAND_SECONDPASS);
-
- if ( tmpValue == null ){
- tmpValue = "";
- }
- if ( !"".equals(tmpValue.trim()) ){
- strIn = new ByteArrayInputStream(tmpValue.getBytes());
- DocumentBuilderFactory dfactory =
DocumentBuilderFactory.newInstance();
+ Properties config = (Properties) tidyConfig.clone();
+ String val = tidyConfig.getProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS);
+ if (val == null) {
+ val = "";
+ }
+ String tag = null;
+ int size = newBlockLevelTags.size();
+ for (int i = 0; i < size; i++) {
+ tag = (String) newBlockLevelTags.get(i);
+ if (val.length() == 0) {
+ val = tag;
+ }
+ else {
+ val += "," + tag;
+ }
+ }
+ config.setProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS, val);
+
+ val = config.getProperty(TidyConfig.NEW_INLINE_TAGS);
+ if (val == null) {
+ val = "";
+ }
+ size = newInlineTags.size();
+ for (int i = 0; i < size; i++) {
+ tag = (String) newInlineTags.get(i);
+ if (val.length() == 0) {
+ val = tag;
+ }
+ else {
+ val += "," + tag;
+ }
+ }
+ config.setProperty(TidyConfig.NEW_INLINE_TAGS, val);
+
+ tidy.setConfigurationFromProps(config);
+
+ try {
+
+ byte strByte[] = result.getBytes();
+ strIn = new ByteArrayInputStream(strByte);
+ strOut = new ByteArrayOutputStream();
+ ByteArrayOutputStream strErr = new ByteArrayOutputStream();
+ tidy.setErrout(new PrintWriter(strErr, true));
+ tidy.setShowWarnings(false);
+ tidy.parse(strIn, strOut);
+
+ strIn.reset();
+ String tmpValue = strOut.toString();
+ tmpValue = JahiaTools.replacePattern(tmpValue, "&",
+ AMPERSAND_SECONDPASS);
+
+ if (tmpValue == null) {
+ tmpValue = "";
+ }
+ if (!"".equals(tmpValue.trim())) {
+ strIn = new ByteArrayInputStream(tmpValue.getBytes());
+
+ DocumentBuilderFactory dfactory = DocumentBuilderFactory.
+ newInstance();
+
+ EntityResolver et = null;
+ try {
+ et = ServicesRegistry.getInstance().
+ getJahiaWebAppsDeployerService().getDtdEntityResolver();
+ }
+ catch (Throwable t) {
+ }
+ DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
+ if (et != null) {
+ docBuilder.setEntityResolver(et);
+ }
+
+ Document doc = docBuilder.parse(strIn);
+
+ RemoveUnrecognizedMarkupVisitor rumv =
+ new RemoveUnrecognizedMarkupVisitor();
+
+ synchronized(unrecognizedTags){
+ size = unrecognizedTags.size();
+ for (int i = 0; i < size; i++) {
+ rumv.addTag( (String) unrecognizedTags.get(i));
+ }
+ }
+
+ doc = rumv.parseDOM(doc);
+
+ size = DOMVisitors.size();
+ for (int i = 0; i < size; i++) {
+ HtmlDOMVisitor visitor = (HtmlDOMVisitor) DOMVisitors.get(i);
+ doc = visitor.parseDOM(doc);
+ }
+
+ doc.normalize();
+
+ TransformerFactory tfactory = TransformerFactory.newInstance();
+
+ // This creates a transformer that does a simple identity transform,
+ // and thus can be used for all intents and purposes as a serializer.
+ Transformer serializer = tfactory.newTransformer();
+
+ serializer.setOutputProperty(OutputKeys.METHOD, "xml");
+ serializer.setOutputProperty(OutputKeys.INDENT, "yes");
+ //serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
+ //serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "4");
+ strOut.reset();
+ serializer.transform(new DOMSource(doc),
+ new StreamResult(strOut));
+
+ if (tidy.getParseErrors() > 0) {
+ result = "<TIDYERRORS>\n" + strErr.toString() +
+ "</TIDYERRORS>";
+ }
+ else {
+ result = strOut.toString();
+ }
+ result = JahiaTools.replacePattern(result, AMPERSAND_SECONDPASS,
+ "&");
+ result = JahiaTools.text2XMLEntityRef(result, 1);
+ result = JahiaTools.replacePattern(result, AMPERSAND, "&");
+
+ }
+ else if (tidy.getParseErrors() > 0) {
+ String err = strErr.toString();
+ result = "<TIDYERRORS>\n" + err + "</TIDYERRORS>";
+ if (err.indexOf("is not recognized!") != -1) {
+ err = JahiaTools.replacePatternIgnoreCase(err.toLowerCase(),
+ " - error: ", "@@@");
+ String[] errors = org.jahia.utils.JahiaTools.getTokens(
+ err, "@@@");
+ if (errors.length > 0) {
+ String token = "";
+ ArrayList tags = new ArrayList();
+ tag = null;
+ String newInput = input;
+ int pos = -1;
+ for (int i = 0; i < errors.length; i++) {
+ token = errors[i];
+ pos = token.indexOf(" is not recognized!");
+ if (pos != -1) {
+ try {
+ tag = token.substring(0,pos);
+ if ( !tag.startsWith("<") ){
+ // we found an unknown empty tag
+ synchronized(unrecognizedTags){
+ if (unrecognizedTags.contains(tag)) {
+ continue;
+ }
+ else {
+ unrecognizedTags.add(tag);
+ newInlineTags.add(tag);
+ }
+ }
+ } else {
+ tag = tag.substring(1, tag.length() - 1);
+ synchronized(unrecognizedTags){
+ if (unrecognizedTags.contains(tag)) {
+ continue;
+ }
+ else {
+ unrecognizedTags.add(tag);
+ newBlockLevelTags.add(tag);
+ }
+ }
+ }
+ }
+ catch (Throwable t) {
+ }
+ }
+ }
+ result = parse(input, siteId, tidyConfig,
+ DOMVisitors);
+ }
+ }
+ }
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ return input;
+ }
- EntityResolver et = null;
- try {
- et =
ServicesRegistry.getInstance().getJahiaWebAppsDeployerService().getDtdEntityResolver();
- } catch ( Throwable t ){
- }
- DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
- if ( et != null ){
- docBuilder.setEntityResolver(et);
- }
-
- Document doc = docBuilder.parse(strIn);
-
- int size = DOMVisitors.size();
- for ( int i=0 ; i<size; i++ ){
- HtmlDOMVisitor visitor = (HtmlDOMVisitor)DOMVisitors.get(i);
- doc = visitor.parseDOM(doc);
- }
-
- doc.normalize();
-
- TransformerFactory tfactory = TransformerFactory.newInstance();
-
- // This creates a transformer that does a simple identity transform,
- // and thus can be used for all intents and purposes as a
serializer.
- Transformer serializer = tfactory.newTransformer();
-
- serializer.setOutputProperty(OutputKeys.METHOD, "xml");
- serializer.setOutputProperty(OutputKeys.INDENT, "yes");
-
//serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
-
//serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "4");
- strOut.reset();
- serializer.transform (new DOMSource(doc),
- new StreamResult(strOut));
-
- if ( tidy.getParseErrors() > 0 ) {
- result = "<TIDYERRORS>\n" + strErr.toString() + "</TIDYERRORS>";
- } else {
- result = strOut.toString();
- }
- result = JahiaTools.replacePattern(result,AMPERSAND_SECONDPASS,"&");
- result = JahiaTools.text2XMLEntityRef(result,1);
- result = JahiaTools.replacePattern(result,AMPERSAND,"&");
-
- } else if ( tidy.getParseErrors() > 0 ){
- result = "<TIDYERRORS>\n" + strErr.toString() + "</TIDYERRORS>";
- result = JahiaTools.replacePatternIgnoreCase(result," is not
recognized!","@@@");
- String[] errors =
org.jahia.utils.JahiaTools.getTokens(result,"@@@");
- if ( errors.length>0 ){
- String token = "";
- ArrayList tags = new ArrayList();
- String tag = null;
- String newInput = input;
- for (int i = 0; i < errors.length; i++) {
- token = errors[i];
- tag = token.substring(token.lastIndexOf("<"),
- token.lastIndexOf(">"));
- newInput =
JahiaTools.replacePatternIgnoreCase(newInput,tag,"$$$notrecognizedtag$$$"+tag.substring(1));
- }
- result = parse( newInput, siteId, tidyConfig, DOMVisitors);
- result = JahiaTools.replacePatternIgnoreCase
(result,"$$$notrecognizedtag$$$","<");
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- return input;
- }
-
- return result.trim();
- }
+ return result.trim();
+ }
/**
* Clone a Tidy DOM Tree and return a JAXP 2 DOM Tree