svn commit: r1149493 - in /incubator/stanbol/trunk/enhancer/engines/metaxa: ./ src/main/java/org/apache/stanbol/enhancer/engines/metaxa/ src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/

wkasper Fri, 22 Jul 2011 01:12:00 -0700

Author: wkasper
Date: Fri Jul 22 08:11:22 2011
New Revision: 1149493

URL: http://svn.apache.org/viewvc?rev=1149493&view=rev
Log:
Stanbol-212: Enable use of other configurations and extractors
Moved all patches to the DomBuilder of HtmlCleaner-2.1 to a new subclass to 
remove the dependency on a patch release.


Added:
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
Modified:
    incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/README.md?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/README.md (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/README.md Fri Jul 22 
08:11:22 2011
@@ -270,6 +270,18 @@ The following table describes which voca
     </tr>
 </table>
 
+## Configuration options
+
+By default, Metaxa uses the extractors specified in the resource 
"extractionregistry.xml", and for HTML pages, the resource "htmlextractors.xml".
+Alternative configurations and extractors can be attached to Metaxa as 
fragment bundles, specifying as host bundle
+
+    Fragment-Host: org.apache.stanbol.enhancer.engines.metaxa
+
+The alternative configuration files then can be set as values of the properties
+
+* 
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.extractionregistry</code></pre>
+
+* 
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.htmlextractors</code></pre>
 
 ## Usage
 

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
 Fri Jul 22 08:11:22 2011
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
 
 import java.io.IOException;
 import java.util.Collections;
+import java.util.Dictionary;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -32,10 +33,12 @@ import org.apache.clerezza.rdf.core.impl
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl;
 import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.enhancer.engines.metaxa.core.MetaxaCore;
 import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils;
 import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver;
+import 
org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
@@ -50,6 +53,7 @@ import org.ontoware.rdf2go.model.node.Da
 import org.ontoware.rdf2go.model.node.Node;
 import org.ontoware.rdf2go.model.node.PlainLiteral;
 import org.ontoware.rdf2go.model.node.URI;
+import org.osgi.framework.BundleContext;
 import org.osgi.service.component.ComponentContext;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.slf4j.Logger;
@@ -62,13 +66,12 @@ import org.slf4j.LoggerFactory;
  * @author Joerg Steffen, DFKI
  * @version $Id$
  */
-@Component(immediate = true, metatype = true)
+@Component(immediate = true, metatype = true,
+    label="Apache Stanbol Text and Metadata Extraction Engine",
+    description="Extract plain text and embedded metadata form various 
document types and formats")
 @Service
 public class MetaxaEngine implements EnhancementEngine, ServiceProperties {
 
-    /**
-     * This contains the logger.
-     */
     private static final Logger log = 
LoggerFactory.getLogger(MetaxaEngine.class);
 
     /**
@@ -78,10 +81,27 @@ public class MetaxaEngine implements Enh
     public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
 
     /**
-     * This contains the Aperture extractor.
+     * name of a file defining the available document extractors for Metaxa. 
By default, the builtin file 'extractionregistry.xml' is used.
+     */
+    @Property(label="ExtractorRegistry",
+        description="The path of a resource on the bundle classpath that 
specifies which extractors to use.",
+        value="extractionregistry.xml")
+    public static final String GLOBAL_EXTRACTOR_REGISTRY = 
"org.apache.stanbol.enhancer.engines.metaxa.extractionregistry";
+
+    /**
+     * name of a file that defines the set of extractors for HTML documents. 
By default, the builtin file 'htmlextractors.xml' is used.
      */
+    @Property(label="HtmlExtractors",value="htmlextractors.xml",
+        description="The path of a resource on the bundle classpath that 
specifies which extractors are used for HTML pages.")
+    public static final String HTML_EXTRACTOR_REGISTRY = 
"org.apache.stanbol.enhancer.engines.metaxa.htmlextractors";
+
     private MetaxaCore extractor;
+    
+    BundleContext bundleContext;
 
+    public static final String DEFAULT_EXTRACTION_REGISTRY = 
"extractionregistry.xml";
+    public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = 
"htmlextractors.xml";
+    
     /**
      * The activate method.
      *
@@ -89,13 +109,27 @@ public class MetaxaEngine implements Enh
      * @throws IOException if initializing fails
      */
     protected void activate(ComponentContext ce) throws IOException {
-
-        try {
-            this.extractor = new MetaxaCore("extractionregistry.xml");
-            BundleURIResolver.BUNDLE = ce.getBundleContext().getBundle();
-        } catch (IOException e) {
-            log.error(e.getLocalizedMessage(), e);
-            throw e;
+        String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY;
+        String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
+        if (ce != null) {
+            this.bundleContext = ce.getBundleContext();
+            BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
+            try {
+                Dictionary<String, String> properties = ce.getProperties();
+                String confFile = properties.get(GLOBAL_EXTRACTOR_REGISTRY);
+                if (confFile != null && confFile.trim().length() > 0) {
+                    extractionRegistry = confFile;
+                }
+                confFile = properties.get(HTML_EXTRACTOR_REGISTRY);
+                if (confFile != null && confFile.trim().length() > 0) {
+                    htmlExtractors = confFile;
+                }
+                this.extractor = new MetaxaCore(extractionRegistry);
+                HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
+            } catch (IOException e) {
+                log.error(e.getLocalizedMessage(), e);
+                throw e;
+            }
         }
     }
 
@@ -119,13 +153,6 @@ public class MetaxaEngine implements Enh
     public void computeEnhancements(ContentItem ci) throws EngineException {
 
         try {
-            // get the model where to add the statements
-            MGraph g = ci.getMetadata();
-            // create enhancement
-            UriRef textEnhancement = 
EnhancementEngineHelper.createTextEnhancement(ci, this);
-            // set confidence value to 1.0
-            LiteralFactory literalFactory = LiteralFactory.getInstance();
-            g.add(new TripleImpl(textEnhancement, 
Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
             // get model from the extraction
             Model m = this.extractor.extract(ci.getStream(), ci.getId(), 
ci.getMimeType());
             // add the statements from this model to the Metadata model
@@ -134,6 +161,13 @@ public class MetaxaEngine implements Enh
                String text = MetaxaCore.getText(m);
                log.info(text);
                 */
+                // get the model where to add the statements
+                MGraph g = ci.getMetadata();
+                // create enhancement
+                UriRef textEnhancement = 
EnhancementEngineHelper.createTextEnhancement(ci, this);
+                // set confidence value to 1.0
+                LiteralFactory literalFactory = LiteralFactory.getInstance();
+                g.add(new TripleImpl(textEnhancement, 
Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
                 RDF2GoUtils.urifyBlankNodes(m);
                 HashMap<BlankNode, BNode> blankNodeMap = new 
HashMap<BlankNode, BNode>();
                 ClosableIterator<Statement> it = m.iterator();
@@ -147,10 +181,11 @@ public class MetaxaEngine implements Enh
                     if (null != subject && null != predicate && null != 
object) {
                         Triple t = new TripleImpl(subject, predicate, object);
                         g.add(t);
-                        log.info("added " + t.toString());
+                        log.debug("added " + t.toString());
                     }
                 }
                 it.close();
+                m.close();
             }
         } catch (ExtractorException e) {
             throw new EngineException(e.getLocalizedMessage(), e);

Added: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java?rev=1149493&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/DomSerializer2.java
 Fri Jul 22 08:11:22 2011
@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.html;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.CommentToken;
+import org.htmlcleaner.ContentToken;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.TagNode;
+import org.htmlcleaner.Utils;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+/**
+ *  Patches for HtmlCleaner-2.1 for namespace handling and correcting XML 
serialization.
+ *  The patches are not applicable to HtmlCleaner-2.2, which suffers from 
losing namespaces altogether.
+ *
+ * @author <a href="mailto:[email protected]";>Walter Kasper</a>
+ * 
+ */
+
+public class DomSerializer2 extends DomSerializer {
+
+  public DomSerializer2(CleanerProperties props, boolean escapeXml) {
+    super(props,escapeXml);
+  }
+  
+  public DomSerializer2(CleanerProperties props) {
+    this(props, true);
+  }
+
+  public Document createDOM(TagNode rootNode) throws 
ParserConfigurationException {
+    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+    factory.setNamespaceAware(props.isNamespacesAware());
+    Document document = factory.newDocumentBuilder().newDocument();
+    Element rootElement = document.createElement(rootNode.getName());;
+    document.appendChild(rootElement);
+    setAttributes(rootNode, rootElement);
+
+    createSubnodes(document, rootElement, rootNode.getChildren());
+
+    return document;
+}
+ 
+  private void setAttributes(TagNode node, Element element) {
+    Map<?,?> attributes =  node.getAttributes();
+    Iterator<?> entryIterator = attributes.entrySet().iterator();
+    while (entryIterator.hasNext()) {
+      Map.Entry<?,?> entry = (Map.Entry<?,?>) entryIterator.next();
+      String attrName = (String)entry.getKey();
+      String attrValue = (String)entry.getValue();
+      if (escapeXml) {
+        attrValue = escapeXml(attrValue, props, true);
+      }
+      // avoid xhtml declarations
+      if (!attrName.equals("xmlns")) {
+        element.setAttribute(attrName, attrValue);
+      }
+    }
+  }
+  
+  private void createSubnodes(Document document, Element element, List 
tagChildren) {
+    if (tagChildren != null) {
+      Iterator it = tagChildren.iterator();
+      while (it.hasNext()) {
+        Object item = it.next();
+        if (item instanceof CommentToken) {
+          CommentToken commentNode = (CommentToken) item;
+          Comment comment = document.createComment( 
commentNode.getContent().toString() );
+          element.appendChild(comment);
+        } else if (item instanceof ContentToken) {
+          ContentToken contentToken = (ContentToken) item;
+          String content = contentToken.getContent();
+          String nodeName = element.getNodeName();
+          boolean specialCase = props.isUseCdataForScriptAndStyle() &&
+          ("script".equalsIgnoreCase(nodeName) || 
"style".equalsIgnoreCase(nodeName));
+          if (escapeXml && !specialCase) {
+            content = escapeXml(content, props, true);
+          }
+          element.appendChild( specialCase ? 
document.createCDATASection(content) : document.createTextNode(content) );
+        } else if (item instanceof TagNode) {
+          TagNode subTagNode = (TagNode) item;
+          Element subelement = document.createElement( subTagNode.getName() );;
+          
+          setAttributes(subTagNode, subelement);
+          
+          // recursively create subnodes
+          createSubnodes(document, subelement, subTagNode.getChildren());
+          
+          element.appendChild(subelement);
+        } else if (item instanceof List) {
+          List sublist = (List) item;
+          createSubnodes(document, element, sublist);
+        }
+      }
+    }
+  }
+    
+  /**
+   * Escapes XML string.
+   * @param s String to be escaped
+   * @param props Cleaner properties that affect escaping behaviour
+   * @param isDomCreation Tells if escaped content will be part of the DOM
+   */
+  public static String escapeXml(String s, CleanerProperties props, boolean 
isDomCreation) {
+      boolean advanced = props.isAdvancedXmlEscape();
+      boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars();
+      boolean translateSpecialEntities = props.isTranslateSpecialEntities();
+
+      if (s != null) {
+      int len = s.length();
+      StringBuffer result = new StringBuffer(len);
+      
+      for (int i = 0; i < len; i++) {
+        char ch = s.charAt(i);
+        
+        if (ch == '&') {
+          if ( (advanced || recognizeUnicodeChars) && (i < len-1) && 
(s.charAt(i+1) == '#') ) {
+            int charIndex = i + 2;
+            String unicode = "";
+            while ( charIndex < len &&
+                              (Utils.isHexadecimalDigit(s.charAt(charIndex)) 
|| s.charAt(charIndex) == 'x' || s.charAt(charIndex) == 'X') 
+                            ) {
+              unicode += s.charAt(charIndex);
+              charIndex++;
+            }
+            if (charIndex == len || !"".equals(unicode)) {
+              try {
+                char unicodeChar = unicode.toLowerCase().startsWith("x") ?
+                                                      
(char)Integer.parseInt(unicode.substring(1), 16) :                              
  
+                                                      
(char)Integer.parseInt(unicode);
+//                 if ( "&<>\'\"".indexOf(unicodeChar) < 0 ) {
+                  int replaceChunkSize = (charIndex < len && 
s.charAt(charIndex) == ';') ? unicode.length()+1 : unicode.length();
+                  result.append( recognizeUnicodeChars ? 
String.valueOf(unicodeChar) : "&#" + unicode + ";" );
+                  i += replaceChunkSize + 1;
+//                } else {
+//                    i = charIndex;
+//                    result.append("&#" + unicode + ";");
+//                }
+              } catch (NumberFormatException e) {
+                i = charIndex;
+                result.append("&#" + unicode + ";");
+              }
+            } else {
+              result.append("&");
+            }
+          } else {
+            if (translateSpecialEntities) {
+              // get the following sequence of at most 10 characters
+              String seq = s.substring(i, i+Math.min(10, len-i));
+              int semiIndex = seq.indexOf(';');
+              if (semiIndex > 0) {
+                String entity = seq.substring(1, semiIndex);
+                Integer code = entities.get(entity);
+                if (code != null) {
+                  int entityLen = entity.length();
+                                  if (recognizeUnicodeChars) {
+                                      result.append( (char)code.intValue() );
+                                  } else {
+                                      result.append( "&#" + code + ";" );
+                                  }
+                  i += entityLen + 1;
+                  continue;
+                }
+              }
+            }
+            
+            if (advanced) {
+                          String sub = s.substring(i);
+                          if ( sub.startsWith("&amp;") ) {
+                              result.append(isDomCreation ? "&" : "&amp;");
+                              i += 4;
+                          } else if ( sub.startsWith("&apos;") ) {
+                              result.append(isDomCreation ? "'" : "&apos;");
+                              i += 5;
+                          } else if ( sub.startsWith("&gt;") ) {
+                              result.append(isDomCreation ? ">" : "&gt;");
+                              i += 3;
+                          } else if ( sub.startsWith("&lt;") ) {
+                              result.append(isDomCreation ? "<" : "&lt;");
+                              i += 3;
+                          } else if ( sub.startsWith("&quot;") ) {
+                              result.append(isDomCreation ? "\"" : "&quot;");
+                              i += 5;
+                          } else {
+                              result.append(isDomCreation ? "&" : "&amp;");
+                          }
+              
+              continue;
+            }
+            
+            result.append("&");
+          }
+        }
+        else if (!isDomCreation) {
+          if (ch == '\'') {
+            result.append("&apos;");
+          } else if (ch == '>') {
+            result.append("&gt;");
+          } else if (ch == '<') {
+            result.append("&lt;");
+          } else if (ch == '\"') {
+            result.append("&quot;");
+          }
+        }
+        else {
+          result.append(ch);
+        }
+      }
+      return result.toString();
+    }
+    
+    return null;
+  }
+
+  // copied from the SpecialEntities class because its map is package-private 
and therefore not visible here
+  public static Map<String,Integer> entities = new HashMap<String,Integer>();
+
+  static {
+    entities.put("nbsp",  new Integer(160));
+    entities.put("iexcl", new Integer(161));
+    entities.put("curren",  new Integer(164));
+    entities.put("cent",  new Integer(162));
+    entities.put("pound", new Integer(163));
+    entities.put("yen",   new Integer(165));
+    entities.put("brvbar",  new Integer(166));
+    entities.put("sect",  new Integer(167));
+    entities.put("uml",   new Integer(168));
+    entities.put("copy",  new Integer(169));
+    entities.put("ordf",  new Integer(170));
+    entities.put("laquo", new Integer(171));
+    entities.put("not",   new Integer(172));
+    entities.put("shy",   new Integer(173));
+    entities.put("reg",   new Integer(174));
+    entities.put("trade", new Integer(8482));
+    entities.put("macr",  new Integer(175));
+    entities.put("deg",   new Integer(176));
+    entities.put("plusmn",  new Integer(177));
+    entities.put("sup2",  new Integer(178));
+    entities.put("sup3",  new Integer(179));
+    entities.put("acute", new Integer(180));
+    entities.put("micro", new Integer(181));
+    entities.put("para",  new Integer(182));
+    entities.put("middot",  new Integer(183));
+    entities.put("cedil", new Integer(184));
+    entities.put("sup1",  new Integer(185));
+    entities.put("ordm",  new Integer(186));
+    entities.put("raquo", new Integer(187));
+    entities.put("frac14",  new Integer(188));
+    entities.put("frac12",  new Integer(189));
+    entities.put("frac34",  new Integer(190));
+    entities.put("iquest",  new Integer(191));
+    entities.put("times", new Integer(215));
+    entities.put("divide",  new Integer(247));
+
+    entities.put("Agrave",  new Integer(192));
+    entities.put("Aacute",  new Integer(193));
+    entities.put("Acirc", new Integer(194));
+    entities.put("Atilde",  new Integer(195));
+    entities.put("Auml",  new Integer(196));
+    entities.put("Aring", new Integer(197));
+    entities.put("AElig", new Integer(198));
+    entities.put("Ccedil",  new Integer(199));
+    entities.put("Egrave",  new Integer(200));
+    entities.put("Eacute",  new Integer(201));
+    entities.put("Ecirc", new Integer(202));
+    entities.put("Euml",  new Integer(203));
+    entities.put("Igrave",  new Integer(204));
+    entities.put("Iacute",  new Integer(205));
+    entities.put("Icirc", new Integer(206));
+    entities.put("Iuml",  new Integer(207));
+    entities.put("ETH",   new Integer(208));
+    entities.put("Ntilde",  new Integer(209));
+    entities.put("Ograve",  new Integer(210));
+    entities.put("Oacute",  new Integer(211));
+    entities.put("Ocirc", new Integer(212));
+    entities.put("Otilde",  new Integer(213));
+    entities.put("Ouml",  new Integer(214));
+    entities.put("Oslash",  new Integer(216));
+    entities.put("Ugrave",  new Integer(217));
+    entities.put("Uacute",  new Integer(218));
+    entities.put("Ucirc", new Integer(219));
+    entities.put("Uuml",  new Integer(220));
+    entities.put("Yacute",  new Integer(221));
+    entities.put("THORN", new Integer(222));
+    entities.put("szlig", new Integer(223));
+    entities.put("agrave",  new Integer(224));
+    entities.put("aacute",  new Integer(225));
+    entities.put("acirc", new Integer(226));
+    entities.put("atilde",  new Integer(227));
+    entities.put("auml",  new Integer(228));
+    entities.put("aring", new Integer(229));
+    entities.put("aelig", new Integer(230));
+    entities.put("ccedil",  new Integer(231));
+    entities.put("egrave",  new Integer(232));
+    entities.put("eacute",  new Integer(233));
+    entities.put("ecirc", new Integer(234));
+    entities.put("euml",  new Integer(235));
+    entities.put("igrave",  new Integer(236));
+    entities.put("iacute",  new Integer(237));
+    entities.put("icirc", new Integer(238));
+    entities.put("iuml",  new Integer(239));
+    entities.put("eth",   new Integer(240));
+    entities.put("ntilde",  new Integer(241));
+    entities.put("ograve",  new Integer(242));
+    entities.put("oacute",  new Integer(243));
+    entities.put("ocirc", new Integer(244));
+    entities.put("otilde",  new Integer(245));
+    entities.put("ouml",  new Integer(246));
+    entities.put("oslash",  new Integer(248));
+    entities.put("ugrave",  new Integer(249));
+    entities.put("uacute",  new Integer(250));
+    entities.put("ucirc", new Integer(251));
+    entities.put("uuml",  new Integer(252));
+    entities.put("yacute",  new Integer(253));
+    entities.put("thorn", new Integer(254));
+    entities.put("yuml",  new Integer(255));
+
+    entities.put("OElig", new Integer(338));
+    entities.put("oelig", new Integer(339));
+    entities.put("Scaron",  new Integer(352));
+    entities.put("scaron",  new Integer(353));
+    entities.put("Yuml",  new Integer(376));
+    entities.put("circ",  new Integer(710));
+    entities.put("tilde", new Integer(732));
+    entities.put("ensp",  new Integer(8194));
+    entities.put("emsp",  new Integer(8195));
+    entities.put("thinsp",  new Integer(8201));
+    entities.put("zwnj",  new Integer(8204));
+    entities.put("zwj",   new Integer(8205));
+    entities.put("lrm",   new Integer(8206));
+    entities.put("rlm",   new Integer(8207));
+    entities.put("ndash", new Integer(8211));
+    entities.put("mdash", new Integer(8212));
+    entities.put("lsquo", new Integer(8216));
+    entities.put("rsquo", new Integer(8217));
+    entities.put("sbquo", new Integer(8218));
+    entities.put("ldquo", new Integer(8220));
+    entities.put("rdquo", new Integer(8221));
+    entities.put("bdquo", new Integer(8222));
+    entities.put("dagger",  new Integer(8224));
+    entities.put("Dagger",  new Integer(8225));
+    entities.put("hellip",  new Integer(8230));
+    entities.put("permil",  new Integer(8240));
+    entities.put("lsaquo",  new Integer(8249));
+    entities.put("rsaquo",  new Integer(8250));
+    entities.put("euro",  new Integer(8364));
+  }
+
+  
+}

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractionRegistry.java
 Fri Jul 22 08:11:22 2011
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;
@@ -66,10 +67,20 @@ public class HtmlExtractionRegistry {
     public HtmlExtractionRegistry(String configFileName)
             throws InitializationException {
         this();
-        initialize(configFileName);
+        InputStream config = 
getClass().getClassLoader().getResourceAsStream(configFileName);
+        if (config == null) {
+            throw new InitializationException("File not found: 
"+configFileName);
+        }
+        initialize(config);
     }
 
-    public void initialize(String configFileName)
+    public HtmlExtractionRegistry(InputStream config) throws 
InitializationException {
+        this();
+        initialize(config);
+    }
+    
+    
+    public void initialize(InputStream configFileStream)
             throws InitializationException {
 
         try {
@@ -77,7 +88,7 @@ public class HtmlExtractionRegistry {
             XPath xPath = factory.newXPath();
             DocumentBuilder parser =
                 DocumentBuilderFactory.newInstance().newDocumentBuilder();
-            Document document = parser.parse(new 
InputSource(getClass().getClassLoader().getResourceAsStream(configFileName)));
+            Document document = parser.parse(new 
InputSource(configFileStream));
             Node node;
             NodeList nodes = (NodeList) 
xPath.evaluate("/htmlextractors/extractor", document, XPathConstants.NODESET);
             if (nodes != null) {

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlExtractorFactory.java
 Fri Jul 22 08:11:22 2011
@@ -17,6 +17,8 @@
 package org.apache.stanbol.enhancer.engines.metaxa.core.html;
 
 import org.semanticdesktop.aperture.extractor.Extractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * HtmlExtractorFactory.java
@@ -27,9 +29,26 @@ import org.semanticdesktop.aperture.extr
 public class HtmlExtractorFactory extends
         org.semanticdesktop.aperture.extractor.html.HtmlExtractorFactory {
 
+    private static final Logger LOG = 
LoggerFactory.getLogger(HtmlExtractorFactory.class);
+    
+    public static String REGISTRY_CONFIGURATION = "htmlextractors.xml";
+    private HtmlExtractionRegistry registry;
+    private HtmlParser parser;
+
+    public HtmlExtractorFactory() throws InstantiationException {
+        this.parser = new HtmlParser();
+        try {
+            registry = new HtmlExtractionRegistry(REGISTRY_CONFIGURATION);
+        }
+        catch (InitializationException e) {
+            LOG.error("Registry Initialization Error: " + e.getMessage());
+            throw new InstantiationException(e.getMessage());
+        }
+    }
+    
     @Override
     public Extractor get() {
-        return new IksHtmlExtractor();
+        return new IksHtmlExtractor(registry, parser);
     }
 
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlParser.java
 Fri Jul 22 08:11:22 2011
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import javax.xml.parsers.ParserConfigurationException;
 
 import org.htmlcleaner.CleanerProperties;
-import org.htmlcleaner.DomSerializer;
 import org.htmlcleaner.HtmlCleaner;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -43,7 +42,7 @@ public class HtmlParser {
 
     private HtmlCleaner htmlToXmlParser;
     private CleanerProperties parserProps;
-    private DomSerializer domCreator;
+    private DomSerializer2 domCreator;
 
 
     public HtmlParser() {
@@ -51,11 +50,11 @@ public class HtmlParser {
         this.parserProps = this.htmlToXmlParser.getProperties();
         this.parserProps.setRecognizeUnicodeChars(true);
         this.parserProps.setUseEmptyElementTags(true);
-        // this.parserProps.setAdvancedXmlEscape(true);
+        this.parserProps.setAdvancedXmlEscape(true);
         this.parserProps.setTranslateSpecialEntities(true);
         this.parserProps.setOmitComments(true);
         this.parserProps.setPruneTags("script,style,form,map,noscript");
-        this.domCreator = new DomSerializer(this.parserProps);
+        this.domCreator = new DomSerializer2(this.parserProps,true);
         // TODO override otpions form config
     }
 

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java?rev=1149493&r1=1149492&r2=1149493&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java Fri Jul 22 08:11:22 2011
@@ -48,32 +48,38 @@ public class IksHtmlExtractor implements
 
     public static String DEFAULT_CONFIGURATION = "htmlextractors.xml";
 
-    private static HtmlParser htmlParser = new HtmlParser();
+    private HtmlParser htmlParser;
 
-    private static HtmlExtractionRegistry registry =
-        new HtmlExtractionRegistry();
-    static {
+    public HtmlExtractionRegistry registry = null;
+
+    public IksHtmlExtractor() {
+      // lazy initialization when used first
+      if (registry == null) {
         try {
-            registry.initialize(DEFAULT_CONFIGURATION);
+            this.htmlParser = new HtmlParser();
+            this.registry = new HtmlExtractionRegistry(DEFAULT_CONFIGURATION);
         } catch (InitializationException e) {
-            LOG.error("Registration Initialization Error: " + e.getMessage());
+          LOG.error("Registry Initialization Error: " + e.getMessage());
         }
+      }
     }
-
-    public IksHtmlExtractor() {
+    public IksHtmlExtractor(HtmlExtractionRegistry registry, HtmlParser parser) {
+        this.registry = registry;
+        this.htmlParser = parser;
     }
-
+    
     public IksHtmlExtractor(String configFileName)
             throws InitializationException {
-        this();
-        registry = new HtmlExtractionRegistry(configFileName);
+        this.htmlParser = new HtmlParser();
+        this.registry = new HtmlExtractionRegistry(configFileName);
     }
 
     public void extract(URI id,
             InputStream input, Charset charset, String mimeType,
             RDFContainer result)
             throws ExtractorException {
-
+        if (registry == null)
+            return;
         String encoding;
         if (charset == null) {
             if (!input.markSupported()) {

svn commit: r1149493 - in /incubator/stanbol/trunk/enhancer/engines/metaxa: ./ src/main/java/org/apache/stanbol/enhancer/engines/metaxa/ src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/

Reply via email to