Author: wkasper
Date: Mon Nov 7 09:13:17 2011
New Revision: 1198660
URL: http://svn.apache.org/viewvc?rev=1198660&view=rev
Log:
Added utility class for core HTML text extraction.
Some cleanup, and logging level changed from info to debug.
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlTextExtractUtil.java
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/CharsetRecognizer.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/XsltExtractor.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmlmetadata.xsl
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmltextextract.xsl
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/CharsetRecognizer.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/CharsetRecognizer.java?rev=1198660&r1=1198659&r2=1198660&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/CharsetRecognizer.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/CharsetRecognizer.java
Mon Nov 7 09:13:17 2011
@@ -83,7 +83,7 @@ public class CharsetRecognizer {
}
if (result != null) {
result = result.toUpperCase();
- LOG.info(format.toUpperCase() + " encoding: " + result);
+ LOG.debug(format.toUpperCase() + " encoding: " + result);
}
else {
return defaultValue;
@@ -122,7 +122,7 @@ public class CharsetRecognizer {
detector.setText(in);
CharsetMatch found = detector.detect();
result = found.getName();
- LOG.info("Encoding: " + result);
+ LOG.debug("Encoding: " + result);
return result;
}
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlTextExtractUtil.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlTextExtractUtil.java?rev=1198660&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlTextExtractUtil.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlTextExtractUtil.java
Mon Nov 7 09:13:17 2011
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.transform.TransformerFactory;
+
+import org.ontoware.aifbcommons.collection.ClosableIterator;
+import org.ontoware.rdf2go.model.Model;
+import org.ontoware.rdf2go.model.Statement;
+import org.ontoware.rdf2go.model.Syntax;
+import org.ontoware.rdf2go.model.node.URI;
+import org.ontoware.rdf2go.model.node.Variable;
+import org.ontoware.rdf2go.util.RDFTool;
+import org.semanticdesktop.aperture.extractor.ExtractorException;
+import org.semanticdesktop.aperture.rdf.RDFContainer;
+import org.semanticdesktop.aperture.vocabulary.NCO;
+import org.semanticdesktop.aperture.vocabulary.NIE;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+
+/**
+ * Utility class that provides core HTML text and metadata extraction,
+ * independent of the configuration of Metaxa's main HTML extractor.
+ * <p>
+ * The HTML parser and the XSLT extractor (stylesheet
+ * <code>xslt/htmlmetadata.xsl</code>, RDF/XML output) are static and therefore
+ * shared by all instances; the extractor is created when the first instance
+ * is constructed. The <code>get*</code> methods read values back from a model
+ * that was filled by {@link #extract(URI, String, InputStream, RDFContainer)}.
+ *
+ * @author <a href="mailto:[email protected]">Walter Kasper</a>
+ */
+public class HtmlTextExtractUtil {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HtmlTextExtractUtil.class);
+
+    /** Parser shared by all instances. */
+    private static final HtmlParser htmlParser = new HtmlParser();
+
+    /** Shared extractor; created lazily by the first constructor call. */
+    private static XsltExtractor htmlExtractor;
+
+    /**
+     * Creates the utility and, on first use, the shared XSLT extractor.
+     *
+     * @throws InitializationException
+     *             if setting up the XSLT extractor fails
+     */
+    public HtmlTextExtractUtil() throws InitializationException {
+        // Guard the check-then-create sequence so that concurrent first
+        // constructions cannot both build an extractor.
+        synchronized (HtmlTextExtractUtil.class) {
+            if (htmlExtractor == null) {
+                TransformerFactory transFac = TransformerFactory.newInstance();
+                transFac.setURIResolver(new BundleURIResolver());
+                XsltExtractor extractor =
+                        new XsltExtractor("any", "xslt/htmlmetadata.xsl", transFac);
+                extractor.setSyntax(Syntax.RdfXml);
+                // Publish only after the extractor is fully configured.
+                htmlExtractor = extractor;
+            }
+        }
+    }
+
+    /**
+     * Returns the title recorded in the model.
+     *
+     * @param meta
+     *            the model to search
+     * @return the string form of the object of a <code>nie:title</code>
+     *         statement, or <code>null</code> if the model has none
+     */
+    public String getTitle(Model meta) {
+        Statement stmt =
+                RDFTool.findStatement(meta, Variable.ANY, NIE.title, Variable.ANY);
+        if (stmt != null) {
+            return stmt.getObject().toString();
+        }
+        return null;
+    }
+
+    /**
+     * Returns the author's full name recorded in the model.
+     * <p>
+     * The metadata stylesheet nests the contact description inside
+     * <code>nco:creator</code>, so the <code>nco:fullname</code> is looked up
+     * on the object of the <code>nco:creator</code> statement first. If that
+     * yields nothing, the subject of the <code>nco:creator</code> statement is
+     * tried as well, which was the only lookup performed before.
+     *
+     * @param meta
+     *            the model to search
+     * @return the full name, or <code>null</code> if no creator or no full
+     *         name is found
+     */
+    public String getAuthor(Model meta) {
+        Statement creator =
+                RDFTool.findStatement(meta, Variable.ANY, NCO.creator, Variable.ANY);
+        if (creator == null) {
+            return null;
+        }
+        Statement fullname = null;
+        // Fully qualified type name: avoids touching the file's import list.
+        if (creator.getObject() instanceof org.ontoware.rdf2go.model.node.Resource) {
+            fullname = RDFTool.findStatement(meta,
+                    (org.ontoware.rdf2go.model.node.Resource) creator.getObject(),
+                    NCO.fullname, Variable.ANY);
+        }
+        if (fullname == null) {
+            // Backward-compatible fallback: name attached to the creator
+            // statement's subject.
+            fullname = RDFTool.findStatement(meta, creator.getSubject(),
+                    NCO.fullname, Variable.ANY);
+        }
+        if (fullname != null) {
+            return fullname.getObject().toString();
+        }
+        return null;
+    }
+
+    /**
+     * Returns the description recorded in the model.
+     *
+     * @param meta
+     *            the model to search
+     * @return the string form of the object of a <code>nie:description</code>
+     *         statement, or <code>null</code> if the model has none
+     */
+    public String getDescription(Model meta) {
+        Statement stmt = RDFTool.findStatement(meta, Variable.ANY,
+                NIE.description, Variable.ANY);
+        if (stmt != null) {
+            return stmt.getObject().toString();
+        }
+        return null;
+    }
+
+    /**
+     * Collects the keywords recorded in the model.
+     *
+     * @param meta
+     *            the model to search
+     * @return the string forms of the objects of all <code>nie:keyword</code>
+     *         statements; empty (never <code>null</code>) if there are none
+     */
+    public List<String> getKeywords(Model meta) {
+        List<String> kws = new ArrayList<String>();
+        ClosableIterator<Statement> it =
+                meta.findStatements(Variable.ANY, NIE.keyword, Variable.ANY);
+        try {
+            while (it.hasNext()) {
+                kws.add(it.next().getObject().toString());
+            }
+        } finally {
+            // Release the iterator even if reading a statement fails.
+            it.close();
+        }
+        return kws;
+    }
+
+    /**
+     * Returns the plain text content recorded in the model.
+     *
+     * @param meta
+     *            the model to search
+     * @return the string form of the object of a
+     *         <code>nie:plainTextContent</code> statement, or
+     *         <code>null</code> if the model has none
+     */
+    public String getText(Model meta) {
+        Statement stmt = RDFTool.findStatement(meta, Variable.ANY,
+                NIE.plainTextContent, Variable.ANY);
+        if (stmt != null) {
+            return stmt.getObject().toString();
+        }
+        return null;
+    }
+
+    /**
+     * Parses an HTML stream and runs the shared XSLT extractor on it, adding
+     * the extracted statements to <code>result</code>.
+     *
+     * @param id
+     *            identifier of the document; passed to the extractor as a
+     *            string
+     * @param charset
+     *            character encoding of the input, or <code>null</code> to have
+     *            it detected from the stream
+     * @param input
+     *            the HTML content
+     * @param result
+     *            container receiving the extracted metadata
+     * @throws ExtractorException
+     *             if charset detection fails with an I/O error (the original
+     *             exception is kept as the cause), or if parsing or
+     *             extraction reports one
+     */
+    public void extract(URI id, String charset, InputStream input,
+            RDFContainer result) throws ExtractorException {
+        String encoding = charset;
+        if (encoding == null) {
+            try {
+                // NOTE(review): detection reads from the same stream that is
+                // parsed below; presumably CharsetRecognizer rewinds it -
+                // confirm. The detected value may still be null because null
+                // is passed as the default.
+                encoding = CharsetRecognizer.detect(input, "html", null);
+            } catch (IOException e) {
+                LOG.error("Charset detection problem: " + e.getMessage(), e);
+                throw new ExtractorException(
+                        "Charset detection problem: " + e.getMessage(), e);
+            }
+        }
+        Document doc = htmlParser.getDOM(input, encoding);
+        htmlExtractor.extract(id.toString(), doc, null, result);
+    }
+
+}
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java?rev=1198660&r1=1198659&r2=1198660&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/IksHtmlExtractor.java
Mon Nov 7 09:13:17 2011
@@ -115,7 +115,7 @@ public class IksHtmlExtractor implements
List<String> formats = new ArrayList<String>();
long modelSize = result.getModel().size();
for (String s : registry.getActiveExtractors()) {
- LOG.info("Extractor: " + s);
+ LOG.debug("Extractor: {}", s);
HtmlExtractionComponent extractor = extractors.get(s);
// TODO: Handle dependencies between Microformat extractors, e.g.
// formats used also in other formats
@@ -123,8 +123,7 @@ public class IksHtmlExtractor implements
extractor.extract(id.toString(), doc, null, result);
long tmpSize = result.getModel().size();
if (modelSize < tmpSize) {
- LOG.info((tmpSize - modelSize) + " Statements added: "
- + s);
+ LOG.debug("{} Statements added: {}",(tmpSize -
modelSize),s);
modelSize = tmpSize;
}
}
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/XsltExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/XsltExtractor.java?rev=1198660&r1=1198659&r2=1198660&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/XsltExtractor.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/XsltExtractor.java
Mon Nov 7 09:13:17 2011
@@ -60,7 +60,7 @@ public class XsltExtractor implements Ht
private Transformer transformer;
private String id;
private URI source;
- private Syntax syntax = XsltExtractor.N3;
+ private Syntax syntax = Syntax.RdfXml;
public XsltExtractor() {
@@ -134,7 +134,6 @@ public class XsltExtractor implements Ht
StreamResult output = new StreamResult(writer);
try {
this.transformer.transform(source, output);
- // TODO put results into the RDFContainer
String rdf = writer.toString();
LOG.debug(rdf);
StringReader reader = new StringReader(rdf);
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmlmetadata.xsl
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmlmetadata.xsl?rev=1198660&r1=1198659&r2=1198660&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmlmetadata.xsl
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmlmetadata.xsl
Mon Nov 7 09:13:17 2011
@@ -54,13 +54,13 @@
</xsl:template>
<xsl:template match="meta[@name='author']">
- <nie:creator>
+ <nco:creator>
<nco:Contact>
<nco:fullname>
<xsl:value-of
select="normalize-space(@content)"/>
</nco:fullname>
</nco:Contact>
- </nie:creator>
+ </nco:creator>
</xsl:template>
<xsl:template match="meta[@name='keywords']">
@@ -94,7 +94,16 @@
<xsl:text>
</xsl:text>
</xsl:template>
-
+
+ <!-- Text extraction: a br element contributes a line break to the extracted text. -->
+ <xsl:template match="br" mode="textextract">
+ <xsl:text>
+</xsl:text>
+ </xsl:template>
+
+ <!-- Text extraction: a pre element contributes its string value directly (xsl:value-of) instead of having the textextract templates applied to its children. -->
+ <xsl:template match="pre" mode="textextract">
+ <xsl:value-of select="."/>
+ </xsl:template>
+
<xsl:template match="*" mode="textextract">
<xsl:apply-templates mode="textextract"/>
</xsl:template>
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmltextextract.xsl
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmltextextract.xsl?rev=1198660&r1=1198659&r2=1198660&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmltextextract.xsl
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/xslt/htmltextextract.xsl
Mon Nov 7 09:13:17 2011
@@ -57,6 +57,15 @@
</xsl:text>
</xsl:template>
+ <!-- Text extraction: a br element contributes a line break to the extracted text. -->
+ <xsl:template match="br" mode="textextract">
+ <xsl:text>
+</xsl:text>
+ </xsl:template>
+
+ <!-- Text extraction: a pre element contributes its string value directly (xsl:value-of) instead of having the textextract templates applied to its children. -->
+ <xsl:template match="pre" mode="textextract">
+ <xsl:value-of select="."/>
+ </xsl:template>
+
<xsl:template match="*" mode="textextract">
<xsl:apply-templates mode="textextract"/>
</xsl:template>