Author: wkasper
Date: Fri Jul 22 08:11:22 2011
New Revision: 1149493
URL: http://svn.apache.org/viewvc?rev=1149493&view=rev
Log:
Stanbol-212: Enable use of other configurations and extractors
Moved all patches to the DomBuilder of HtmlCleaner-2.1 to a new subclass to
remove the dependency on a patch release.
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/README.md?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/README.md (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/README.md Fri Jul 22
08:11:22 2011
@@ -270,6 +270,18 @@ The following table describes which voca
</tr>
</table>
+## Configuration options
+
+By default, Metaxa uses the extractors specified in the resource
"extractionregistry.xml" and, for HTML pages, those in the resource "htmlextractors.xml".
+Alternative configurations and extractors can be attached to Metaxa as
fragment bundles, specifying as host bundle
+
+ Fragment-Host: org.apache.stanbol.enhancer.engines.metaxa
+
+The alternative configuration files then can be set as values of the properties
+
+*
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.extractionregistry</code></pre>
+
+*
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.htmlextractors</code></pre>
## Usage
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
Fri Jul 22 08:11:22 2011
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import java.io.IOException;
import java.util.Collections;
+import java.util.Dictionary;
import java.util.HashMap;
import java.util.Map;
@@ -32,10 +33,12 @@ import org.apache.clerezza.rdf.core.impl
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl;
import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.engines.metaxa.core.MetaxaCore;
import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils;
import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver;
+import
org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
@@ -50,6 +53,7 @@ import org.ontoware.rdf2go.model.node.Da
import org.ontoware.rdf2go.model.node.Node;
import org.ontoware.rdf2go.model.node.PlainLiteral;
import org.ontoware.rdf2go.model.node.URI;
+import org.osgi.framework.BundleContext;
import org.osgi.service.component.ComponentContext;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.slf4j.Logger;
@@ -62,13 +66,12 @@ import org.slf4j.LoggerFactory;
* @author Joerg Steffen, DFKI
* @version $Id$
*/
-@Component(immediate = true, metatype = true)
+@Component(immediate = true, metatype = true,
+ label="Apache Stanbol Text and Metadata Extraction Engine",
+ description="Extract plain text and embedded metadata form various
document types and formats")
@Service
public class MetaxaEngine implements EnhancementEngine, ServiceProperties {
- /**
- * This contains the logger.
- */
private static final Logger log =
LoggerFactory.getLogger(MetaxaEngine.class);
/**
@@ -78,10 +81,27 @@ public class MetaxaEngine implements Enh
public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
/**
- * This contains the Aperture extractor.
+ * name of a file defining the available document extractors for Metaxa.
By default, the builtin file 'extractionregistry.xml' is used.
+ */
+ @Property(label="ExtractorRegistry",
+ description="The path of a resource on the bundle classpath that
specifies which extractors to use.",
+ value="extractionregistry.xml")
+ public static final String GLOBAL_EXTRACTOR_REGISTRY =
"org.apache.stanbol.enhancer.engines.metaxa.extractionregistry";
+
+ /**
+ * name of a file that defines the set of extractors for HTML documents.
By default, the builtin file 'htmlextractors.xml' is used.
*/
+ @Property(label="HtmlExtractors",value="htmlextractors.xml",
+ description="The path of a resource on the bundle classpath that
specifies which extractors are used for HTML pages.")
+ public static final String HTML_EXTRACTOR_REGISTRY =
"org.apache.stanbol.enhancer.engines.metaxa.htmlextractors";
+
private MetaxaCore extractor;
+
+ BundleContext bundleContext;
+ public static final String DEFAULT_EXTRACTION_REGISTRY =
"extractionregistry.xml";
+ public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY =
"htmlextractors.xml";
+
/**
* The activate method.
*
@@ -89,13 +109,27 @@ public class MetaxaEngine implements Enh
* @throws IOException if initializing fails
*/
protected void activate(ComponentContext ce) throws IOException {
-
- try {
- this.extractor = new MetaxaCore("extractionregistry.xml");
- BundleURIResolver.BUNDLE = ce.getBundleContext().getBundle();
- } catch (IOException e) {
- log.error(e.getLocalizedMessage(), e);
- throw e;
+ String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY;
+ String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
+ if (ce != null) {
+ this.bundleContext = ce.getBundleContext();
+ BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
+ try {
+ Dictionary<String, String> properties = ce.getProperties();
+ String confFile = properties.get(GLOBAL_EXTRACTOR_REGISTRY);
+ if (confFile != null && confFile.trim().length() > 0) {
+ extractionRegistry = confFile;
+ }
+ confFile = properties.get(HTML_EXTRACTOR_REGISTRY);
+ if (confFile != null && confFile.trim().length() > 0) {
+ htmlExtractors = confFile;
+ }
+ this.extractor = new MetaxaCore(extractionRegistry);
+ HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
+ } catch (IOException e) {
+ log.error(e.getLocalizedMessage(), e);
+ throw e;
+ }
}
}
@@ -119,13 +153,6 @@ public class MetaxaEngine implements Enh
public void computeEnhancements(ContentItem ci) throws EngineException {
try {
- // get the model where to add the statements
- MGraph g = ci.getMetadata();
- // create enhancement
- UriRef textEnhancement =
EnhancementEngineHelper.createTextEnhancement(ci, this);
- // set confidence value to 1.0
- LiteralFactory literalFactory = LiteralFactory.getInstance();
- g.add(new TripleImpl(textEnhancement,
Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
// get model from the extraction
Model m = this.extractor.extract(ci.getStream(), ci.getId(),
ci.getMimeType());
// add the statements from this model to the Metadata model
@@ -134,6 +161,13 @@ public class MetaxaEngine implements Enh
String text = MetaxaCore.getText(m);
log.info(text);
*/
+ // get the model where to add the statements
+ MGraph g = ci.getMetadata();
+ // create enhancement
+ UriRef textEnhancement =
EnhancementEngineHelper.createTextEnhancement(ci, this);
+ // set confidence value to 1.0
+ LiteralFactory literalFactory = LiteralFactory.getInstance();
+ g.add(new TripleImpl(textEnhancement,
Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
RDF2GoUtils.urifyBlankNodes(m);
HashMap<BlankNode, BNode> blankNodeMap = new
HashMap<BlankNode, BNode>();
ClosableIterator<Statement> it = m.iterator();
@@ -147,10 +181,11 @@ public class MetaxaEngine implements Enh
if (null != subject && null != predicate && null !=
object) {
Triple t = new TripleImpl(subject, predicate, object);
g.add(t);
- log.info("added " + t.toString());
+ log.debug("added " + t.toString());
}
}
it.close();
+ m.close();
}
} catch (ExtractorException e) {
throw new EngineException(e.getLocalizedMessage(), e);
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java?rev=1149493&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
Fri Jul 22 08:11:22 2011
@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.html;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.CommentToken;
+import org.htmlcleaner.ContentToken;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.TagNode;
+import org.htmlcleaner.Utils;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+/**
+ * Patches for HtmlCleaner-2.1 for namespace handling and correcting XML
serialization.
+ * The patches are not applicable for HtmlCleaner-2.2 that suffers from
losing namespaces altogether.
+ *
+ * @author <a href="mailto:[email protected]">Walter Kasper</a>
+ *
+ */
+
+public class DomSerializer2 extends DomSerializer {
+
+ /**
+  * Creates a serializer, handing both arguments unchanged to the
+  * {@link DomSerializer} superclass constructor.
+  *
+  * @param props cleaner properties passed on to the superclass
+  * @param escapeXml escaping flag passed on to the superclass
+  */
+ public DomSerializer2(CleanerProperties props, boolean escapeXml) {
+ super(props,escapeXml);
+ }
+
+ /**
+  * Creates a serializer with the escaping flag set to {@code true}.
+  *
+  * @param props cleaner properties passed on to the two-argument constructor
+  */
+ public DomSerializer2(CleanerProperties props) {
+ this(props, true);
+ }
+
+    /**
+     * Builds a W3C DOM document from an HtmlCleaner tag tree.
+     * <p>
+     * The document element is created from the name of {@code rootNode}; its
+     * attributes and children are then copied over by {@code setAttributes}
+     * and {@code createSubnodes}.
+     *
+     * @param rootNode root of the tag tree to convert
+     * @return the newly created document
+     * @throws ParserConfigurationException if no document builder can be created
+     */
+    public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
+        DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
+        // namespace awareness follows the cleaner configuration
+        builderFactory.setNamespaceAware(props.isNamespacesAware());
+        Document dom = builderFactory.newDocumentBuilder().newDocument();
+
+        Element root = dom.createElement(rootNode.getName());
+        dom.appendChild(root);
+        setAttributes(rootNode, root);
+        createSubnodes(dom, root, rootNode.getChildren());
+
+        return dom;
+    }
+
+    /**
+     * Copies the attributes of a tag node onto a DOM element.
+     * <p>
+     * Values are passed through {@code escapeXml} when escaping is enabled.
+     * An attribute named exactly {@code xmlns} is not copied.
+     *
+     * @param node tag node supplying the attributes
+     * @param element DOM element receiving them
+     */
+    private void setAttributes(TagNode node, Element element) {
+        Map<?,?> attributes = node.getAttributes();
+        for (Map.Entry<?,?> attribute : attributes.entrySet()) {
+            String name = (String) attribute.getKey();
+            String value = (String) attribute.getValue();
+            if (escapeXml) {
+                value = escapeXml(value, props, true);
+            }
+            // avoid xhtml declarations
+            if (name.equals("xmlns")) {
+                continue;
+            }
+            element.setAttribute(name, value);
+        }
+    }
+
+    /**
+     * Appends DOM nodes for a list of HtmlCleaner children to an element.
+     * <p>
+     * Comment tokens become comments, content tokens become text nodes (or
+     * CDATA sections inside {@code script}/{@code style} when the cleaner is
+     * configured that way), tag nodes become elements whose own children are
+     * converted recursively, and nested lists are flattened into the same
+     * parent element.
+     *
+     * @param document owner document used to create the new nodes
+     * @param element parent element receiving the new nodes
+     * @param tagChildren children to convert; nothing is done when {@code null}
+     */
+    private void createSubnodes(Document document, Element element, List tagChildren) {
+        if (tagChildren == null) {
+            return;
+        }
+        for (Object child : tagChildren) {
+            if (child instanceof CommentToken) {
+                String commentText = ((CommentToken) child).getContent().toString();
+                Comment comment = document.createComment(commentText);
+                element.appendChild(comment);
+            } else if (child instanceof ContentToken) {
+                String text = ((ContentToken) child).getContent();
+                String parentName = element.getNodeName();
+                // script/style content is kept verbatim in a CDATA section when configured
+                boolean asCdata = props.isUseCdataForScriptAndStyle()
+                        && ("script".equalsIgnoreCase(parentName) || "style".equalsIgnoreCase(parentName));
+                if (asCdata) {
+                    element.appendChild(document.createCDATASection(text));
+                } else {
+                    if (escapeXml) {
+                        text = escapeXml(text, props, true);
+                    }
+                    element.appendChild(document.createTextNode(text));
+                }
+            } else if (child instanceof TagNode) {
+                TagNode childTag = (TagNode) child;
+                Element childElement = document.createElement(childTag.getName());
+                setAttributes(childTag, childElement);
+                // recursively create subnodes
+                createSubnodes(document, childElement, childTag.getChildren());
+                element.appendChild(childElement);
+            } else if (child instanceof List) {
+                createSubnodes(document, element, (List) child);
+            }
+        }
+    }
+
+ /**
+ * Escapes XML string.
+ * @param s String to be escaped
+ * @param props Cleaner properties gover affect escaping behaviour
+ * @param isDomCreation Tells if escaped content will be part of the DOM
+ */
+ public static String escapeXml(String s, CleanerProperties props, boolean
isDomCreation) {
+ boolean advanced = props.isAdvancedXmlEscape();
+ boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars();
+ boolean translateSpecialEntities = props.isTranslateSpecialEntities();
+
+ if (s != null) {
+ int len = s.length();
+ StringBuffer result = new StringBuffer(len);
+
+ for (int i = 0; i < len; i++) {
+ char ch = s.charAt(i);
+
+ if (ch == '&') {
+ if ( (advanced || recognizeUnicodeChars) && (i < len-1) &&
(s.charAt(i+1) == '#') ) {
+ int charIndex = i + 2;
+ String unicode = "";
+ while ( charIndex < len &&
+ (Utils.isHexadecimalDigit(s.charAt(charIndex))
|| s.charAt(charIndex) == 'x' || s.charAt(charIndex) == 'X')
+ ) {
+ unicode += s.charAt(charIndex);
+ charIndex++;
+ }
+ if (charIndex == len || !"".equals(unicode)) {
+ try {
+ char unicodeChar = unicode.toLowerCase().startsWith("x") ?
+
(char)Integer.parseInt(unicode.substring(1), 16) :
+
(char)Integer.parseInt(unicode);
+// if ( "&<>\'\"".indexOf(unicodeChar) < 0 ) {
+ int replaceChunkSize = (charIndex < len &&
s.charAt(charIndex) == ';') ? unicode.length()+1 : unicode.length();
+ result.append( recognizeUnicodeChars ?
String.valueOf(unicodeChar) : "&#" + unicode + ";" );
+ i += replaceChunkSize + 1;
+// } else {
+// i = charIndex;
+// result.append("&#" + unicode + ";");
+// }
+ } catch (NumberFormatException e) {
+ i = charIndex;
+ result.append("&#" + unicode + ";");
+ }
+ } else {
+ result.append("&");
+ }
+ } else {
+ if (translateSpecialEntities) {
+ // get following sequence of most 10 characters
+ String seq = s.substring(i, i+Math.min(10, len-i));
+ int semiIndex = seq.indexOf(';');
+ if (semiIndex > 0) {
+ String entity = seq.substring(1, semiIndex);
+ Integer code = entities.get(entity);
+ if (code != null) {
+ int entityLen = entity.length();
+ if (recognizeUnicodeChars) {
+ result.append( (char)code.intValue() );
+ } else {
+ result.append( "&#" + code + ";" );
+ }
+ i += entityLen + 1;
+ continue;
+ }
+ }
+ }
+
+ if (advanced) {
+ String sub = s.substring(i);
+ if ( sub.startsWith("&") ) {
+ result.append(isDomCreation ? "&" : "&");
+ i += 4;
+ } else if ( sub.startsWith("'") ) {
+ result.append(isDomCreation ? "'" : "'");
+ i += 5;
+ } else if ( sub.startsWith(">") ) {
+ result.append(isDomCreation ? ">" : ">");
+ i += 3;
+ } else if ( sub.startsWith("<") ) {
+ result.append(isDomCreation ? "<" : "<");
+ i += 3;
+ } else if ( sub.startsWith(""") ) {
+ result.append(isDomCreation ? "\"" : """);
+ i += 5;
+ } else {
+ result.append(isDomCreation ? "&" : "&");
+ }
+
+ continue;
+ }
+
+ result.append("&");
+ }
+ }
+ else if (!isDomCreation) {
+ if (ch == '\'') {
+ result.append("'");
+ } else if (ch == '>') {
+ result.append(">");
+ } else if (ch == '<') {
+ result.append("<");
+ } else if (ch == '\"') {
+ result.append(""");
+ }
+ }
+ else {
+ result.append(ch);
+ }
+ }
+ return result.toString();
+ }
+
+ return null;
+ }
+
+ // copied from the SpecialEntities class because the map there is package-internal and not visible from here
+ public static Map<String,Integer> entities = new HashMap<String,Integer>();
+
+ static {
+ entities.put("nbsp", new Integer(160));
+ entities.put("iexcl", new Integer(161));
+ entities.put("curren", new Integer(164));
+ entities.put("cent", new Integer(162));
+ entities.put("pound", new Integer(163));
+ entities.put("yen", new Integer(165));
+ entities.put("brvbar", new Integer(166));
+ entities.put("sect", new Integer(167));
+ entities.put("uml", new Integer(168));
+ entities.put("copy", new Integer(169));
+ entities.put("ordf", new Integer(170));
+ entities.put("laquo", new Integer(171));
+ entities.put("not", new Integer(172));
+ entities.put("shy", new Integer(173));
+ entities.put("reg", new Integer(174));
+ entities.put("trade", new Integer(8482));
+ entities.put("macr", new Integer(175));
+ entities.put("deg", new Integer(176));
+ entities.put("plusmn", new Integer(177));
+ entities.put("sup2", new Integer(178));
+ entities.put("sup3", new Integer(179));
+ entities.put("acute", new Integer(180));
+ entities.put("micro", new Integer(181));
+ entities.put("para", new Integer(182));
+ entities.put("middot", new Integer(183));
+ entities.put("cedil", new Integer(184));
+ entities.put("sup1", new Integer(185));
+ entities.put("ordm", new Integer(186));
+ entities.put("raquo", new Integer(187));
+ entities.put("frac14", new Integer(188));
+ entities.put("frac12", new Integer(189));
+ entities.put("frac34", new Integer(190));
+ entities.put("iquest", new Integer(191));
+ entities.put("times", new Integer(215));
+ entities.put("divide", new Integer(247));
+
+ entities.put("Agrave", new Integer(192));
+ entities.put("Aacute", new Integer(193));
+ entities.put("Acirc", new Integer(194));
+ entities.put("Atilde", new Integer(195));
+ entities.put("Auml", new Integer(196));
+ entities.put("Aring", new Integer(197));
+ entities.put("AElig", new Integer(198));
+ entities.put("Ccedil", new Integer(199));
+ entities.put("Egrave", new Integer(200));
+ entities.put("Eacute", new Integer(201));
+ entities.put("Ecirc", new Integer(202));
+ entities.put("Euml", new Integer(203));
+ entities.put("Igrave", new Integer(204));
+ entities.put("Iacute", new Integer(205));
+ entities.put("Icirc", new Integer(206));
+ entities.put("Iuml", new Integer(207));
+ entities.put("ETH", new Integer(208));
+ entities.put("Ntilde", new Integer(209));
+ entities.put("Ograve", new Integer(210));
+ entities.put("Oacute", new Integer(211));
+ entities.put("Ocirc", new Integer(212));
+ entities.put("Otilde", new Integer(213));
+ entities.put("Ouml", new Integer(214));
+ entities.put("Oslash", new Integer(216));
+ entities.put("Ugrave", new Integer(217));
+ entities.put("Uacute", new Integer(218));
+ entities.put("Ucirc", new Integer(219));
+ entities.put("Uuml", new Integer(220));
+ entities.put("Yacute", new Integer(221));
+ entities.put("THORN", new Integer(222));
+ entities.put("szlig", new Integer(223));
+ entities.put("agrave", new Integer(224));
+ entities.put("aacute", new Integer(225));
+ entities.put("acirc", new Integer(226));
+ entities.put("atilde", new Integer(227));
+ entities.put("auml", new Integer(228));
+ entities.put("aring", new Integer(229));
+ entities.put("aelig", new Integer(230));
+ entities.put("ccedil", new Integer(231));
+ entities.put("egrave", new Integer(232));
+ entities.put("eacute", new Integer(233));
+ entities.put("ecirc", new Integer(234));
+ entities.put("euml", new Integer(235));
+ entities.put("igrave", new Integer(236));
+ entities.put("iacute", new Integer(237));
+ entities.put("icirc", new Integer(238));
+ entities.put("iuml", new Integer(239));
+ entities.put("eth", new Integer(240));
+ entities.put("ntilde", new Integer(241));
+ entities.put("ograve", new Integer(242));
+ entities.put("oacute", new Integer(243));
+ entities.put("ocirc", new Integer(244));
+ entities.put("otilde", new Integer(245));
+ entities.put("ouml", new Integer(246));
+ entities.put("oslash", new Integer(248));
+ entities.put("ugrave", new Integer(249));
+ entities.put("uacute", new Integer(250));
+ entities.put("ucirc", new Integer(251));
+ entities.put("uuml", new Integer(252));
+ entities.put("yacute", new Integer(253));
+ entities.put("thorn", new Integer(254));
+ entities.put("yuml", new Integer(255));
+
+ entities.put("OElig", new Integer(338));
+ entities.put("oelig", new Integer(339));
+ entities.put("Scaron", new Integer(352));
+ entities.put("scaron", new Integer(353));
+ entities.put("Yuml", new Integer(376));
+ entities.put("circ", new Integer(710));
+ entities.put("tilde", new Integer(732));
+ entities.put("ensp", new Integer(8194));
+ entities.put("emsp", new Integer(8195));
+ entities.put("thinsp", new Integer(8201));
+ entities.put("zwnj", new Integer(8204));
+ entities.put("zwj", new Integer(8205));
+ entities.put("lrm", new Integer(8206));
+ entities.put("rlm", new Integer(8207));
+ entities.put("ndash", new Integer(8211));
+ entities.put("mdash", new Integer(8212));
+ entities.put("lsquo", new Integer(8216));
+ entities.put("rsquo", new Integer(8217));
+ entities.put("sbquo", new Integer(8218));
+ entities.put("ldquo", new Integer(8220));
+ entities.put("rdquo", new Integer(8221));
+ entities.put("bdquo", new Integer(8222));
+ entities.put("dagger", new Integer(8224));
+ entities.put("Dagger", new Integer(8225));
+ entities.put("hellip", new Integer(8230));
+ entities.put("permil", new Integer(8240));
+ entities.put("lsaquo", new Integer(8249));
+ entities.put("rsaquo", new Integer(8250));
+ entities.put("euro", new Integer(8364));
+ }
+
+
+}
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
Fri Jul 22 08:11:22 2011
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
@@ -66,10 +67,20 @@ public class HtmlExtractionRegistry {
public HtmlExtractionRegistry(String configFileName)
throws InitializationException {
this();
- initialize(configFileName);
+ InputStream config =
getClass().getClassLoader().getResourceAsStream(configFileName);
+ if (config == null) {
+ throw new InitializationException("File not found:
"+configFileName);
+ }
+ initialize(config);
}
- public void initialize(String configFileName)
+ public HtmlExtractionRegistry(InputStream config) throws
InitializationException {
+ this();
+ initialize(config);
+ }
+
+
+ public void initialize(InputStream configFileStream)
throws InitializationException {
try {
@@ -77,7 +88,7 @@ public class HtmlExtractionRegistry {
XPath xPath = factory.newXPath();
DocumentBuilder parser =
DocumentBuilderFactory.newInstance().newDocumentBuilder();
- Document document = parser.parse(new
InputSource(getClass().getClassLoader().getResourceAsStream(configFileName)));
+ Document document = parser.parse(new
InputSource(configFileStream));
Node node;
NodeList nodes = (NodeList)
xPath.evaluate("/htmlextractors/extractor", document, XPathConstants.NODESET);
if (nodes != null) {
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
Fri Jul 22 08:11:22 2011
@@ -17,6 +17,8 @@
package org.apache.stanbol.enhancer.engines.metaxa.core.html;
import org.semanticdesktop.aperture.extractor.Extractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* HtmlExtractorFactory.java
@@ -27,9 +29,26 @@ import org.semanticdesktop.aperture.extr
public class HtmlExtractorFactory extends
org.semanticdesktop.aperture.extractor.html.HtmlExtractorFactory {
+ private static final Logger LOG =
LoggerFactory.getLogger(HtmlExtractorFactory.class);
+
+ public static String REGISTRY_CONFIGURATION = "htmlextractors.xml";
+ private HtmlExtractionRegistry registry;
+ private HtmlParser parser;
+
+ public HtmlExtractorFactory() throws InstantiationException {
+ this.parser = new HtmlParser();
+ try {
+ registry = new HtmlExtractionRegistry(REGISTRY_CONFIGURATION);
+ }
+ catch (InitializationException e) {
+ LOG.error("Registry Initialization Error: " + e.getMessage());
+ throw new InstantiationException(e.getMessage());
+ }
+ }
+
@Override
public Extractor get() {
- return new IksHtmlExtractor();
+ return new IksHtmlExtractor(registry, parser);
}
}
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
Fri Jul 22 08:11:22 2011
@@ -22,7 +22,6 @@ import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import org.htmlcleaner.CleanerProperties;
-import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -43,7 +42,7 @@ public class HtmlParser {
private HtmlCleaner htmlToXmlParser;
private CleanerProperties parserProps;
- private DomSerializer domCreator;
+ private DomSerializer2 domCreator;
public HtmlParser() {
@@ -51,11 +50,11 @@ public class HtmlParser {
this.parserProps = this.htmlToXmlParser.getProperties();
this.parserProps.setRecognizeUnicodeChars(true);
this.parserProps.setUseEmptyElementTags(true);
- // this.parserProps.setAdvancedXmlEscape(true);
+ this.parserProps.setAdvancedXmlEscape(true);
this.parserProps.setTranslateSpecialEntities(true);
this.parserProps.setOmitComments(true);
this.parserProps.setPruneTags("script,style,form,map,noscript");
- this.domCreator = new DomSerializer(this.parserProps);
+ this.domCreator = new DomSerializer2(this.parserProps,true);
// TODO override otpions form config
}
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
Fri Jul 22 08:11:22 2011
@@ -48,32 +48,38 @@ public class IksHtmlExtractor implements
public static String DEFAULT_CONFIGURATION = "htmlextractors.xml";
- private static HtmlParser htmlParser = new HtmlParser();
+ private HtmlParser htmlParser;
- private static HtmlExtractionRegistry registry =
- new HtmlExtractionRegistry();
- static {
+ public HtmlExtractionRegistry registry = null;
+
+ public IksHtmlExtractor() {
+ // lazy initialization when used first
+ if (registry == null) {
try {
- registry.initialize(DEFAULT_CONFIGURATION);
+ this.htmlParser = new HtmlParser();
+ this.registry = new HtmlExtractionRegistry(DEFAULT_CONFIGURATION);
} catch (InitializationException e) {
- LOG.error("Registration Initialization Error: " + e.getMessage());
+ LOG.error("Registry Initialization Error: " + e.getMessage());
}
+ }
}
-
- public IksHtmlExtractor() {
+ public IksHtmlExtractor(HtmlExtractionRegistry registry, HtmlParser
parser) {
+ this.registry = registry;
+ this.htmlParser = parser;
}
-
+
public IksHtmlExtractor(String configFileName)
throws InitializationException {
- this();
- registry = new HtmlExtractionRegistry(configFileName);
+ this.htmlParser = new HtmlParser();
+ this.registry = new HtmlExtractionRegistry(configFileName);
}
public void extract(URI id,
InputStream input, Charset charset, String mimeType,
RDFContainer result)
throws ExtractorException {
-
+ if (registry == null)
+ return;
String encoding;
if (charset == null) {
if (!input.markSupported()) {