tdraier 2005/11/09 12:00:37 CET
Modified files:
core/src/java/org/jahia/services/htmlparser
HtmlParserBaseService.java
NekoHtmlParser.java
core/src/webapp/WEB-INF/etc/spring
applicationcontext-basejahiaconfig.xml
Log:
Fixed the HTML parser; cleaned up the Neko parser.
Revision Changes Path
1.11 +4 -4
jahia/core/src/java/org/jahia/services/htmlparser/HtmlParserBaseService.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/htmlparser/HtmlParserBaseService.java.diff?r1=1.10&r2=1.11&f=h
1.4 +31 -55
jahia/core/src/java/org/jahia/services/htmlparser/NekoHtmlParser.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/htmlparser/NekoHtmlParser.java.diff?r1=1.3&r2=1.4&f=h
1.7 +2 -2
jahia/core/src/webapp/WEB-INF/etc/spring/applicationcontext-basejahiaconfig.xml
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/webapp/WEB-INF/etc/spring/applicationcontext-basejahiaconfig.xml.diff?r1=1.6&r2=1.7&f=h
Index: HtmlParserBaseService.java
===================================================================
RCS file: /home/cvs/repository/jahia/core/src/java/org/jahia/services/htmlparser/HtmlParserBaseService.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- HtmlParserBaseService.java 4 Oct 2005 16:02:38 -0000 1.10
+++ HtmlParserBaseService.java 9 Nov 2005 11:00:36 -0000 1.11
@@ -88,7 +88,7 @@
markupDefinitions =
markupDefinitionManager.getAllMarkupDefinitions();
} catch ( Throwable t ){
- logger.debug("Error loading Markup Settings",t);
+ logger.error("Error loading Markup Settings",t);
markupDefinitions = new Hashtable();
markupSettings = new Hashtable();
}
@@ -369,11 +369,11 @@
}
return parser;
} catch (ClassNotFoundException cnfe) {
- logger.debug(cnfe);
+ logger.error(cnfe);
} catch (InstantiationException ie) {
- logger.debug(ie);
+ logger.error(ie);
} catch (IllegalAccessException iae) {
- logger.debug(iae);
+ logger.error(iae);
}
}
return null;
Index: NekoHtmlParser.java
===================================================================
RCS file: /home/cvs/repository/jahia/core/src/java/org/jahia/services/htmlparser/NekoHtmlParser.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- NekoHtmlParser.java 2 Aug 2005 13:31:57 -0000 1.3
+++ NekoHtmlParser.java 9 Nov 2005 11:00:36 -0000 1.4
@@ -2,6 +2,8 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.StringReader;
+import java.io.StringWriter;
import java.util.Vector;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
@@ -13,6 +15,7 @@
import org.jahia.utils.JahiaTools;
import org.jahia.utils.fileparsers.CharsetDetection;
import org.w3c.dom.Document;
+import org.xml.sax.InputSource;
/**
*
@@ -24,31 +27,31 @@
* @version 1.0
*/
public class NekoHtmlParser implements HtmlParser {
-
+
public static final String AMPERSAND = "$$$amp$$$";
private static final DOMParser domParser = new DOMParser();
-
+
private static final org.apache.log4j.Logger logger =
org.apache.log4j.Logger.getLogger(NekoHtmlParser.class);
-
- private NekoHtmlParser() {
+
+ public NekoHtmlParser() {
try {
domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
domParser.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
- domParser.setProperty("http://cyberneko.org/html/properties/doctype/pubid", null);
- domParser.setProperty("http://cyberneko.org/html/properties/doctype/sysid", null);
+// domParser.setProperty("http://cyberneko.org/html/properties/doctype/pubid", null);
+// domParser.setProperty("http://cyberneko.org/html/properties/doctype/sysid", null);
} catch (Exception e) {
logger.fatal(e.getMessage(), e);
}
}
-
+
/**
*
* @param htmlParserService HtmlParserService
*/
public void init(final HtmlParserService htmlParserService){
}
-
+
/**
* Parses and generates a clean html document, remove unwanted markups,..
* Using default settings
@@ -60,7 +63,7 @@
public String parse(final String inputString, final Vector DOMVisitors){
return parse(inputString,-1,DOMVisitors);
}
-
+
/**
* Parses and generates a clean html document, remove unwanted markups,..
* Using settings as defined for a given site
@@ -71,13 +74,13 @@
* @return
*/
public String parse(final String inputString, final Vector DOMVisitors,
- final int siteId){
+ final int siteId){
if ( inputString == null || inputString.trim().equals("") ){
return inputString;
}
return parse(inputString,siteId,DOMVisitors);
}
-
+
/**
* Parses and generates a clean html document, remove unwanted markups,..
* Using settings as defined for a given site
@@ -88,74 +91,47 @@
* @return
*/
public static String parse( final String input,
- final int siteId,
- final Vector DOMVisitors){
-
+ final int siteId,
+ final Vector DOMVisitors){
+
if ( input == null || "".equals(input.trim())){
return input;
}
-
+
String result = new String(input);
result = JahiaTools.replacePattern(result, "&", AMPERSAND);
-
- ByteArrayInputStream strIn;
- final ByteArrayOutputStream strOut = new ByteArrayOutputStream();
- byte[] strByte = null;
- String charSet = null; // by default open as ascii
- final CharsetDetection charsetDet = new CharsetDetection();
- try {
- strByte = org.apache.commons.io.IOUtils.toByteArray(result);
- strIn = new ByteArrayInputStream(strByte);
- charsetDet.charsetDetection(strIn);
- charSet = charsetDet.getCharset();
- } catch ( Throwable t ){
- logger.error("Error parsing the document", t);
- }
-
+
int size = 0;
try {
- if ( charSet == null ){
- strByte = result.getBytes();
- } else {
- strByte = result.getBytes(charSet);
- }
- strIn = new ByteArrayInputStream(strByte);
- org.xml.sax.InputSource in = new org.xml.sax.InputSource(strIn);
-
domParser.setProperty("http://cyberneko.org/html/properties/default-encoding",
charSet);
+ InputSource in = new InputSource(new StringReader(input));
domParser.parse(in);
Document doc = domParser.getDocument();
-
+
size = DOMVisitors.size();
for (int i = 0; i <size; i++) {
final HtmlDOMVisitor visitor = (HtmlDOMVisitor)
DOMVisitors.get(i);
doc = visitor.parseDOM(doc);
}
-
+
doc.normalize();
final TransformerFactory tfactory =
TransformerFactory.newInstance();
-
+
// This creates a transformer that does a simple identity transform,
// and thus can be used for all intents and purposes as a serializer.
final Transformer serializer = tfactory.newTransformer();
-
+
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
- if ( charSet != null ){
- serializer.setOutputProperty(OutputKeys.ENCODING, charSet);
- }
-
//serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount",
"4");
+
serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "2");
- serializer.transform(new DOMSource(doc),
- new StreamResult(strOut));
- if ( charSet == null ){
- result = strOut.toString();
- } else {
- result = strOut.toString(charSet);
- }
-
+ StringWriter writer = new StringWriter();
+ serializer.transform(new DOMSource(doc), new StreamResult(writer));
+
+ result = writer.toString();
+
result = JahiaTools.text2XMLEntityRef(result, 1);
result = JahiaTools.replacePattern(result, AMPERSAND, "&");
-
+
} catch ( Throwable t ){
logger.error("Error parsing the document", t);
return input;
Index: applicationcontext-basejahiaconfig.xml
===================================================================
RCS file: /home/cvs/repository/jahia/core/src/webapp/WEB-INF/etc/spring/applicationcontext-basejahiaconfig.xml,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- applicationcontext-basejahiaconfig.xml 24 Oct 2005 12:42:49 -0000 1.6
+++ applicationcontext-basejahiaconfig.xml 9 Nov 2005 11:00:36 -0000 1.7
@@ -21,10 +21,10 @@
<bean id="jahiaConfigBean" class="org.jahia.bin.JahiaConfig">
<property name="settings">
<props>
+ <!--
<prop key="org.jahia.services.htmlparser.HtmlParser">org.jahia.services.htmlparser.NekoHtmlParser</prop>
- <!--
- <prop key="org.jahia.services.htmlparser.HtmlParser">org.jahia.services.htmlparser.TidyHtmlParser</prop>
-->
+ <prop key="org.jahia.services.htmlparser.HtmlParser">org.jahia.services.htmlparser.TidyHtmlParser</prop>
</props>
</property>
</bean>